; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512dq < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
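
; The functions below exercise the vector forms of the experimental constrained
; FP intrinsics (fdiv, frem, fmul, fadd, fsub, sqrt, pow, powi, sin, ...) on
; <1 x float>, <2 x double>, <3 x float>, <3 x double>, and <4 x double>
; operands for SSE, AVX, and AVX512 targets.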
define <1 x float> @constrained_vector_fdiv_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fdiv_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: divss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fdiv_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%div = call <1 x float> @llvm.experimental.constrained.fdiv.v1f32(
<1 x float> <float 1.000000e+00>,
<1 x float> <float 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %div
}
define <2 x double> @constrained_vector_fdiv_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fdiv_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
; CHECK-NEXT: divpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fdiv_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
; AVX-NEXT: vdivpd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%div = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(
<2 x double> <double 1.000000e+00, double 2.000000e+00>,
<2 x double> <double 1.000000e+01, double 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %div
}
define <3 x float> @constrained_vector_fdiv_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fdiv_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm1, %xmm2
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm1, %xmm0
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm1, %xmm3
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fdiv_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm2, %xmm2
; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm3, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: retq
entry:
%div = call <3 x float> @llvm.experimental.constrained.fdiv.v3f32(
<3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
<3 x float> <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %div
}
define <3 x double> @constrained_vector_fdiv_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fdiv_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
; CHECK-NEXT: divpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: divsd {{.*}}(%rip), %xmm1
; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fdiv_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vdivsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0]
; AVX-NEXT: vdivpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%div = call <3 x double> @llvm.experimental.constrained.fdiv.v3f64(
<3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>,
<3 x double> <double 1.000000e+01, double 1.000000e+01, double 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %div
}
define <4 x double> @constrained_vector_fdiv_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fdiv_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1]
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [3.0E+0,4.0E+0]
; CHECK-NEXT: divpd %xmm2, %xmm1
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
; CHECK-NEXT: divpd %xmm2, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fdiv_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX1-NEXT: vdivpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fdiv_v4f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1]
; AVX512-NEXT: vmovapd {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX512-NEXT: vdivpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
entry:
%div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64(
<4 x double> <double 1.000000e+00, double 2.000000e+00,
double 3.000000e+00, double 4.000000e+00>,
<4 x double> <double 1.000000e+01, double 1.000000e+01,
double 1.000000e+01, double 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %div
}
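
; There is no x86 instruction for frem, so the constrained form is lowered to
; one fmodf/fmod libcall per element.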
define <1 x float> @constrained_vector_frem_v1f32() #0 {
; CHECK-LABEL: constrained_vector_frem_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_frem_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmodf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%rem = call <1 x float> @llvm.experimental.constrained.frem.v1f32(
<1 x float> <float 1.000000e+00>,
<1 x float> <float 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %rem
}
define <2 x double> @constrained_vector_frem_v2f64() #0 {
; CHECK-LABEL: constrained_vector_frem_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_frem_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%rem = call <2 x double> @llvm.experimental.constrained.frem.v2f64(
<2 x double> <double 1.000000e+00, double 2.000000e+00>,
<2 x double> <double 1.000000e+01, double 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %rem
}
define <3 x float> @constrained_vector_frem_v3f32() #0 {
; CHECK-LABEL: constrained_vector_frem_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_frem_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmodf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmodf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmodf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%rem = call <3 x float> @llvm.experimental.constrained.frem.v3f32(
<3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
<3 x float> <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %rem
}
define <3 x double> @constrained_vector_frem_v3f64() #0 {
; CHECK-LABEL: constrained_vector_frem_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_frem_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq fmod
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%rem = call <3 x double> @llvm.experimental.constrained.frem.v3f64(
<3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>,
<3 x double> <double 1.000000e+01, double 1.000000e+01, double 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %rem
}
define <4 x double> @constrained_vector_frem_v4f64() #0 {
; CHECK-LABEL: constrained_vector_frem_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmod
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_frem_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmod
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
%rem = call <4 x double> @llvm.experimental.constrained.frem.v4f64(
<4 x double> <double 1.000000e+00, double 2.000000e+00,
double 3.000000e+00, double 4.000000e+00>,
<4 x double> <double 1.000000e+01, double 1.000000e+01,
double 1.000000e+01, double 1.000000e+01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %rem
}
define <1 x float> @constrained_vector_fmul_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fmul_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fmul_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%mul = call <1 x float> @llvm.experimental.constrained.fmul.v1f32(
<1 x float> <float 0x7FF0000000000000>,
<1 x float> <float 2.000000e+00>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %mul
}
define <2 x double> @constrained_vector_fmul_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fmul_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fmul_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; AVX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%mul = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(
<2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
<2 x double> <double 2.000000e+00, double 3.000000e+00>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %mul
}
define <3 x float> @constrained_vector_fmul_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fmul_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: mulss %xmm1, %xmm2
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm1
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fmul_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm2
; AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: retq
entry:
%mul = call <3 x float> @llvm.experimental.constrained.fmul.v3f32(
<3 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000,
float 0x7FF0000000000000>,
<3 x float> <float 1.000000e+00, float 1.000000e+01, float 1.000000e+02>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %mul
}
define <3 x double> @constrained_vector_fmul_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fmul_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: mulsd {{.*}}(%rip), %xmm1
; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fmul_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmulsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
; AVX-NEXT: vmulpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%mul = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(
<3 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
double 0x7FEFFFFFFFFFFFFF>,
<3 x double> <double 1.000000e+00, double 1.000000e+01, double 1.000000e+02>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %mul
}
define <4 x double> @constrained_vector_fmul_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fmul_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.0E+0,5.0E+0]
; CHECK-NEXT: mulpd %xmm0, %xmm1
; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fmul_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fmul_v4f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
; AVX512-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
<4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
<4 x double> <double 2.000000e+00, double 3.000000e+00,
double 4.000000e+00, double 5.000000e+00>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %mul
}
define <1 x float> @constrained_vector_fadd_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fadd_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: addss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fadd_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32(
<1 x float> <float 0x7FF0000000000000>,
<1 x float> <float 1.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %add
}
define <2 x double> @constrained_vector_fadd_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fadd_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fadd_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%add = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(
<2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
<2 x double> <double 1.000000e+00, double 1.000000e-01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %add
}
define <3 x float> @constrained_vector_fadd_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fadd_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: addss %xmm2, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: addss %xmm2, %xmm0
; CHECK-NEXT: addss {{.*}}(%rip), %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fadd_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm2
; AVX-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%add = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(
<3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
float 0xFFFFFFFFE0000000>,
<3 x float> <float 2.0, float 1.0, float 0.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %add
}
define <3 x double> @constrained_vector_fadd_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fadd_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: addsd {{.*}}(%rip), %xmm1
; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fadd_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(
<3 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
double 0x7FEFFFFFFFFFFFFF>,
<3 x double> <double 2.0, double 1.0, double 0.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %add
}
define <4 x double> @constrained_vector_fadd_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fadd_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [2.0E+0,2.0000000000000001E-1]
; CHECK-NEXT: addpd %xmm0, %xmm1
; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fadd_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fadd_v4f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
; AVX512-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
<4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
<4 x double> <double 1.000000e+00, double 1.000000e-01,
double 2.000000e+00, double 2.000000e-01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %add
}
define <1 x float> @constrained_vector_fsub_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fsub_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: subss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fsub_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vsubss {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%sub = call <1 x float> @llvm.experimental.constrained.fsub.v1f32(
<1 x float> <float 0x7FF0000000000000>,
<1 x float> <float 1.000000e+00>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %sub
}
define <2 x double> @constrained_vector_fsub_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fsub_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fsub_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
; AVX-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%sub = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(
<2 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF>,
<2 x double> <double 1.000000e+00, double 1.000000e-01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %sub
}
define <3 x float> @constrained_vector_fsub_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fsub_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movaps %xmm1, %xmm2
; CHECK-NEXT: subss %xmm0, %xmm2
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: subss {{.*}}(%rip), %xmm0
; CHECK-NEXT: subss {{.*}}(%rip), %xmm1
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fsub_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vsubss {{.*}}(%rip), %xmm1, %xmm2
; AVX-NEXT: vsubss {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%sub = call <3 x float> @llvm.experimental.constrained.fsub.v3f32(
<3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
float 0xFFFFFFFFE0000000>,
<3 x float> <float 2.0, float 1.0, float 0.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %sub
}
define <3 x double> @constrained_vector_fsub_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fsub_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorpd %xmm0, %xmm0
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: subsd %xmm0, %xmm1
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fsub_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
; AVX-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%sub = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(
<3 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
double 0xFFEFFFFFFFFFFFFF>,
<3 x double> <double 2.0, double 1.0, double 0.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %sub
}
define <4 x double> @constrained_vector_fsub_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fsub_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fsub_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fsub_v4f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
; AVX512-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
<4 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF>,
<4 x double> <double 1.000000e+00, double 1.000000e-01,
double 2.000000e+00, double 2.000000e-01>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %sub
}
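
; Constrained sqrt maps directly onto the sqrtss/sqrtsd/sqrtpd instructions,
; so no libcalls are needed here.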
define <1 x float> @constrained_vector_sqrt_v1f32() #0 {
; CHECK-LABEL: constrained_vector_sqrt_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: sqrtss %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sqrt_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%sqrt = call <1 x float> @llvm.experimental.constrained.sqrt.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %sqrt
}
define <2 x double> @constrained_vector_sqrt_v2f64() #0 {
; CHECK-LABEL: constrained_vector_sqrt_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sqrt_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vsqrtpd {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %sqrt
}
define <3 x float> @constrained_vector_sqrt_v3f32() #0 {
; CHECK-LABEL: constrained_vector_sqrt_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: sqrtss %xmm0, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: sqrtss %xmm0, %xmm0
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: sqrtss %xmm2, %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sqrt_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vsqrtss %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%sqrt = call <3 x float> @llvm.experimental.constrained.sqrt.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %sqrt
}
define <3 x double> @constrained_vector_sqrt_v3f64() #0 {
; CHECK-LABEL: constrained_vector_sqrt_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: sqrtsd %xmm0, %xmm1
; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sqrt_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vsqrtpd {{.*}}(%rip), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %sqrt
}
define <4 x double> @constrained_vector_sqrt_v4f64() #0 {
; CHECK-LABEL: constrained_vector_sqrt_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm1
; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sqrt_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vsqrtpd {{.*}}(%rip), %ymm0
; AVX-NEXT: retq
entry:
%sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %sqrt
}
define <1 x float> @constrained_vector_pow_v1f32() #0 {
; CHECK-LABEL: constrained_vector_pow_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq powf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_pow_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq powf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%pow = call <1 x float> @llvm.experimental.constrained.pow.v1f32(
<1 x float> <float 42.0>,
<1 x float> <float 3.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %pow
}
define <2 x double> @constrained_vector_pow_v2f64() #0 {
; CHECK-LABEL: constrained_vector_pow_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_pow_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%pow = call <2 x double> @llvm.experimental.constrained.pow.v2f64(
<2 x double> <double 42.1, double 42.2>,
<2 x double> <double 3.0, double 3.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %pow
}
define <3 x float> @constrained_vector_pow_v3f32() #0 {
; CHECK-LABEL: constrained_vector_pow_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq powf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq powf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq powf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_pow_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq powf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq powf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq powf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%pow = call <3 x float> @llvm.experimental.constrained.pow.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
<3 x float> <float 3.0, float 3.0, float 3.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %pow
}
define <3 x double> @constrained_vector_pow_v3f64() #0 {
; CHECK-LABEL: constrained_vector_pow_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_pow_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq pow
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%pow = call <3 x double> @llvm.experimental.constrained.pow.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
<3 x double> <double 3.0, double 3.0, double 3.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %pow
}
define <4 x double> @constrained_vector_pow_v4f64() #0 {
; CHECK-LABEL: constrained_vector_pow_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq pow
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_pow_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq pow
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%pow = call <4 x double> @llvm.experimental.constrained.pow.v4f64(
<4 x double> <double 42.1, double 42.2,
double 42.3, double 42.4>,
<4 x double> <double 3.0, double 3.0,
double 3.0, double 3.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %pow
}
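
; powi takes an i32 exponent (here the constant 3) and is lowered to the
; compiler-rt helpers __powisf2/__powidf2, again one call per element.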
define <1 x float> @constrained_vector_powi_v1f32() #0 {
; CHECK-LABEL: constrained_vector_powi_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powisf2
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_powi_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powisf2
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%powi = call <1 x float> @llvm.experimental.constrained.powi.v1f32(
<1 x float> <float 42.0>,
i32 3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %powi
}
define <2 x double> @constrained_vector_powi_v2f64() #0 {
; CHECK-LABEL: constrained_vector_powi_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_powi_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%powi = call <2 x double> @llvm.experimental.constrained.powi.v2f64(
<2 x double> <double 42.1, double 42.2>,
i32 3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %powi
}
define <3 x float> @constrained_vector_powi_v3f32() #0 {
; CHECK-LABEL: constrained_vector_powi_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powisf2
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powisf2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powisf2
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_powi_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powisf2
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powisf2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powisf2
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%powi = call <3 x float> @llvm.experimental.constrained.powi.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
i32 3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %powi
}
define <3 x double> @constrained_vector_powi_v3f64() #0 {
; CHECK-LABEL: constrained_vector_powi_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_powi_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%powi = call <3 x double> @llvm.experimental.constrained.powi.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
i32 3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %powi
}
define <4 x double> @constrained_vector_powi_v4f64() #0 {
; CHECK-LABEL: constrained_vector_powi_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movl $3, %edi
; CHECK-NEXT: callq __powidf2
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_powi_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: movl $3, %edi
; AVX-NEXT: callq __powidf2
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%powi = call <4 x double> @llvm.experimental.constrained.powi.v4f64(
<4 x double> <double 42.1, double 42.2,
double 42.3, double 42.4>,
i32 3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %powi
}
define <1 x float> @constrained_vector_sin_v1f32() #0 {
; CHECK-LABEL: constrained_vector_sin_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq sinf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sin_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq sinf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %sin
}
define <2 x double> @constrained_vector_sin_v2f64() #0 {
; CHECK-LABEL: constrained_vector_sin_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sin_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %sin
}
define <3 x float> @constrained_vector_sin_v3f32() #0 {
; CHECK-LABEL: constrained_vector_sin_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq sinf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq sinf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq sinf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sin_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq sinf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq sinf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq sinf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %sin
}
define <3 x double> @constrained_vector_sin_v3f64() #0 {
; CHECK-LABEL: constrained_vector_sin_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sin_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq sin
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %sin
}
define <4 x double> @constrained_vector_sin_v4f64() #0 {
; CHECK-LABEL: constrained_vector_sin_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq sin
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sin_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq sin
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%sin = call <4 x double> @llvm.experimental.constrained.sin.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %sin
}
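; Constrained cos: expanded to per-element libcalls to cosf/cos, following the
; same scalarization pattern as the sin tests above.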
define <1 x float> @constrained_vector_cos_v1f32() #0 {
; CHECK-LABEL: constrained_vector_cos_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq cosf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_cos_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq cosf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%cos = call <1 x float> @llvm.experimental.constrained.cos.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %cos
}
define <2 x double> @constrained_vector_cos_v2f64() #0 {
; CHECK-LABEL: constrained_vector_cos_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_cos_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %cos
}
define <3 x float> @constrained_vector_cos_v3f32() #0 {
; CHECK-LABEL: constrained_vector_cos_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq cosf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq cosf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq cosf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_cos_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq cosf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq cosf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq cosf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%cos = call <3 x float> @llvm.experimental.constrained.cos.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %cos
}
define <3 x double> @constrained_vector_cos_v3f64() #0 {
; CHECK-LABEL: constrained_vector_cos_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_cos_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq cos
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %cos
}
define <4 x double> @constrained_vector_cos_v4f64() #0 {
; CHECK-LABEL: constrained_vector_cos_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq cos
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_cos_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq cos
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%cos = call <4 x double> @llvm.experimental.constrained.cos.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %cos
}
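; Constrained exp: expanded to per-element libcalls to expf/exp.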
define <1 x float> @constrained_vector_exp_v1f32() #0 {
; CHECK-LABEL: constrained_vector_exp_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq expf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq expf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp = call <1 x float> @llvm.experimental.constrained.exp.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %exp
}
define <2 x double> @constrained_vector_exp_v2f64() #0 {
; CHECK-LABEL: constrained_vector_exp_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %exp
}
define <3 x float> @constrained_vector_exp_v3f32() #0 {
; CHECK-LABEL: constrained_vector_exp_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq expf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq expf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq expf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq expf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq expf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq expf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %exp
}
define <3 x double> @constrained_vector_exp_v3f64() #0 {
; CHECK-LABEL: constrained_vector_exp_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq exp
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %exp
}
define <4 x double> @constrained_vector_exp_v4f64() #0 {
; CHECK-LABEL: constrained_vector_exp_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp = call <4 x double> @llvm.experimental.constrained.exp.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %exp
}
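; Constrained exp2: expanded to per-element libcalls to exp2f/exp2.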
define <1 x float> @constrained_vector_exp2_v1f32() #0 {
; CHECK-LABEL: constrained_vector_exp2_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq exp2f
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp2_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq exp2f
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp2 = call <1 x float> @llvm.experimental.constrained.exp2.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %exp2
}
define <2 x double> @constrained_vector_exp2_v2f64() #0 {
; CHECK-LABEL: constrained_vector_exp2_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp2_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64(
<2 x double> <double 42.1, double 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %exp2
}
define <3 x float> @constrained_vector_exp2_v3f32() #0 {
; CHECK-LABEL: constrained_vector_exp2_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq exp2f
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq exp2f
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq exp2f
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp2_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq exp2f
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq exp2f
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq exp2f
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %exp2
}
define <3 x double> @constrained_vector_exp2_v3f64() #0 {
; CHECK-LABEL: constrained_vector_exp2_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp2_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq exp2
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %exp2
}
define <4 x double> @constrained_vector_exp2_v4f64() #0 {
; CHECK-LABEL: constrained_vector_exp2_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq exp2
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_exp2_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq exp2
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%exp2 = call <4 x double> @llvm.experimental.constrained.exp2.v4f64(
<4 x double> <double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %exp2
}
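; Constrained log: expanded to per-element libcalls to logf/log.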
define <1 x float> @constrained_vector_log_v1f32() #0 {
; CHECK-LABEL: constrained_vector_log_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq logf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq logf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log = call <1 x float> @llvm.experimental.constrained.log.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %log
}
define <2 x double> @constrained_vector_log_v2f64() #0 {
; CHECK-LABEL: constrained_vector_log_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log = call <2 x double> @llvm.experimental.constrained.log.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %log
}
define <3 x float> @constrained_vector_log_v3f32() #0 {
; CHECK-LABEL: constrained_vector_log_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq logf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq logf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq logf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq logf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq logf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq logf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log = call <3 x float> @llvm.experimental.constrained.log.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %log
}
define <3 x double> @constrained_vector_log_v3f64() #0 {
; CHECK-LABEL: constrained_vector_log_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq log
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log = call <3 x double> @llvm.experimental.constrained.log.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %log
}
define <4 x double> @constrained_vector_log_v4f64() #0 {
; CHECK-LABEL: constrained_vector_log_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log = call <4 x double> @llvm.experimental.constrained.log.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %log
}
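; Constrained log10: expanded to per-element libcalls to log10f/log10.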
define <1 x float> @constrained_vector_log10_v1f32() #0 {
; CHECK-LABEL: constrained_vector_log10_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log10f
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log10_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log10f
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log10 = call <1 x float> @llvm.experimental.constrained.log10.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %log10
}
define <2 x double> @constrained_vector_log10_v2f64() #0 {
; CHECK-LABEL: constrained_vector_log10_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log10_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %log10
}
define <3 x float> @constrained_vector_log10_v3f32() #0 {
; CHECK-LABEL: constrained_vector_log10_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log10f
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log10f
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log10f
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log10_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log10f
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log10f
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log10f
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %log10
}
define <3 x double> @constrained_vector_log10_v3f64() #0 {
; CHECK-LABEL: constrained_vector_log10_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log10_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq log10
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log10 = call <3 x double> @llvm.experimental.constrained.log10.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %log10
}
define <4 x double> @constrained_vector_log10_v4f64() #0 {
; CHECK-LABEL: constrained_vector_log10_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log10
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log10_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log10
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log10 = call <4 x double> @llvm.experimental.constrained.log10.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %log10
}
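; Constrained log2: expanded to per-element libcalls to log2f/log2.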
define <1 x float> @constrained_vector_log2_v1f32() #0 {
; CHECK-LABEL: constrained_vector_log2_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log2f
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log2_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log2f
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log2 = call <1 x float> @llvm.experimental.constrained.log2.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %log2
}
define <2 x double> @constrained_vector_log2_v2f64() #0 {
; CHECK-LABEL: constrained_vector_log2_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log2_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64(
<2 x double> <double 42.0, double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %log2
}
define <3 x float> @constrained_vector_log2_v3f32() #0 {
; CHECK-LABEL: constrained_vector_log2_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log2f
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log2f
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq log2f
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log2_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log2f
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log2f
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq log2f
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %log2
}
define <3 x double> @constrained_vector_log2_v3f64() #0 {
; CHECK-LABEL: constrained_vector_log2_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log2_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq log2
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %log2
}
define <4 x double> @constrained_vector_log2_v4f64() #0 {
; CHECK-LABEL: constrained_vector_log2_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq log2
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_log2_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq log2
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%log2 = call <4 x double> @llvm.experimental.constrained.log2.v4f64(
<4 x double> <double 42.0, double 42.1,
double 42.2, double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %log2
}
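; Constrained rint: without AVX it is expanded to libcalls to rintf/rint; with
; AVX it selects vroundss/vroundsd/vroundpd with immediate 4, which rounds
; using the current (dynamic) rounding mode without suppressing exceptions.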
define <1 x float> @constrained_vector_rint_v1f32() #0 {
; CHECK-LABEL: constrained_vector_rint_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq rintf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_rint_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%rint = call <1 x float> @llvm.experimental.constrained.rint.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %rint
}
define <2 x double> @constrained_vector_rint_v2f64() #0 {
; CHECK-LABEL: constrained_vector_rint_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_rint_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $4, {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%rint = call <2 x double> @llvm.experimental.constrained.rint.v2f64(
<2 x double> <double 42.1, double 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %rint
}
define <3 x float> @constrained_vector_rint_v3f32() #0 {
; CHECK-LABEL: constrained_vector_rint_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq rintf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq rintf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq rintf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_rint_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%rint = call <3 x float> @llvm.experimental.constrained.rint.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %rint
}
define <3 x double> @constrained_vector_rint_v3f64() #0 {
; CHECK-LABEL: constrained_vector_rint_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_rint_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vroundpd $4, {{.*}}(%rip), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %rint
}
define <4 x double> @constrained_vector_rint_v4f64() #0 {
; CHECK-LABEL: constrained_vector_rint_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq rint
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_rint_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $4, {{.*}}(%rip), %ymm0
; AVX-NEXT: retq
entry:
%rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
<4 x double> <double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %rint
}
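; Note: nearbyint is identical to rint except that it must not raise the
; inexact exception, so the AVX lowering uses round immediate 12 (bit 3 set to
; suppress the precision exception) instead of 4.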
define <1 x float> @constrained_vector_nearbyint_v1f32() #0 {
; CHECK-LABEL: constrained_vector_nearbyint_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq nearbyintf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_nearbyint_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%nearby = call <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(
<1 x float> <float 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %nearby
}
define <2 x double> @constrained_vector_nearbyint_v2f64() #0 {
; CHECK-LABEL: constrained_vector_nearbyint_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_nearbyint_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $12, {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(
<2 x double> <double 42.1, double 42.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %nearby
}
define <3 x float> @constrained_vector_nearbyint_v3f32() #0 {
; CHECK-LABEL: constrained_vector_nearbyint_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq nearbyintf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq nearbyintf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq nearbyintf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_nearbyint_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $12, %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $12, %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%nearby = call <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(
<3 x float> <float 42.0, float 43.0, float 44.0>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %nearby
}
define <3 x double> @constrained_vector_nearby_v3f64() #0 {
; CHECK-LABEL: constrained_vector_nearby_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_nearby_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vroundpd $12, {{.*}}(%rip), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(
<3 x double> <double 42.0, double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %nearby
}
define <4 x double> @constrained_vector_nearbyint_v4f64() #0 {
; CHECK-LABEL: constrained_vector_nearbyint_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq nearbyint
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_nearbyint_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $12, {{.*}}(%rip), %ymm0
; AVX-NEXT: retq
entry:
%nearby = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(
<4 x double> <double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %nearby
}
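; Note: neither run line selects maxps/minps for the constrained maxnum/minnum
; intrinsics; the operations are expanded element by element into fmax/fmaxf
; (resp. fmin/fminf) libcalls.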
define <1 x float> @constrained_vector_maxnum_v1f32() #0 {
; CHECK-LABEL: constrained_vector_maxnum_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmaxf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_maxnum_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmaxf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32(
<1 x float> <float 42.0>, <1 x float> <float 41.0>,
metadata !"fpexcept.strict") #0
ret <1 x float> %max
}
define <2 x double> @constrained_vector_maxnum_v2f64() #0 {
; CHECK-LABEL: constrained_vector_maxnum_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_maxnum_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%max = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64(
<2 x double> <double 43.0, double 42.0>,
<2 x double> <double 41.0, double 40.0>,
metadata !"fpexcept.strict") #0
ret <2 x double> %max
}
define <3 x float> @constrained_vector_maxnum_v3f32() #0 {
; CHECK-LABEL: constrained_vector_maxnum_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmaxf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmaxf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmaxf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_maxnum_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmaxf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmaxf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fmaxf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%max = call <3 x float> @llvm.experimental.constrained.maxnum.v3f32(
<3 x float> <float 43.0, float 44.0, float 45.0>,
<3 x float> <float 41.0, float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <3 x float> %max
}
define <3 x double> @constrained_vector_max_v3f64() #0 {
; CHECK-LABEL: constrained_vector_max_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_max_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq fmax
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%max = call <3 x double> @llvm.experimental.constrained.maxnum.v3f64(
<3 x double> <double 43.0, double 44.0, double 45.0>,
<3 x double> <double 40.0, double 41.0, double 42.0>,
metadata !"fpexcept.strict") #0
ret <3 x double> %max
}
define <4 x double> @constrained_vector_maxnum_v4f64() #0 {
; CHECK-LABEL: constrained_vector_maxnum_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmax
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_maxnum_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmax
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%max = call <4 x double> @llvm.experimental.constrained.maxnum.v4f64(
<4 x double> <double 44.0, double 45.0,
double 46.0, double 47.0>,
<4 x double> <double 40.0, double 41.0,
double 42.0, double 43.0>,
metadata !"fpexcept.strict") #0
ret <4 x double> %max
}
define <1 x float> @constrained_vector_minnum_v1f32() #0 {
; CHECK-LABEL: constrained_vector_minnum_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fminf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_minnum_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fminf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32(
<1 x float> <float 42.0>, <1 x float> <float 41.0>,
metadata !"fpexcept.strict") #0
ret <1 x float> %min
}
define <2 x double> @constrained_vector_minnum_v2f64() #0 {
; CHECK-LABEL: constrained_vector_minnum_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_minnum_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%min = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(
<2 x double> <double 43.0, double 42.0>,
<2 x double> <double 41.0, double 40.0>,
metadata !"fpexcept.strict") #0
ret <2 x double> %min
}
define <3 x float> @constrained_vector_minnum_v3f32() #0 {
; CHECK-LABEL: constrained_vector_minnum_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fminf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fminf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fminf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_minnum_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fminf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq fminf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: callq fminf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%min = call <3 x float> @llvm.experimental.constrained.minnum.v3f32(
<3 x float> <float 43.0, float 44.0, float 45.0>,
<3 x float> <float 41.0, float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <3 x float> %min
}
define <3 x double> @constrained_vector_min_v3f64() #0 {
; CHECK-LABEL: constrained_vector_min_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_min_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq fmin
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%min = call <3 x double> @llvm.experimental.constrained.minnum.v3f64(
<3 x double> <double 43.0, double 44.0, double 45.0>,
<3 x double> <double 40.0, double 41.0, double 42.0>,
metadata !"fpexcept.strict") #0
ret <3 x double> %min
}
define <4 x double> @constrained_vector_minnum_v4f64() #0 {
; CHECK-LABEL: constrained_vector_minnum_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: callq fmin
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_minnum_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: callq fmin
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%min = call <4 x double> @llvm.experimental.constrained.minnum.v4f64(
<4 x double> <double 44.0, double 45.0,
double 46.0, double 47.0>,
<4 x double> <double 40.0, double 41.0,
double 42.0, double 43.0>,
metadata !"fpexcept.strict") #0
ret <4 x double> %min
}
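; Note: constrained fptosi maps directly onto the truncating conversions
; (cvttss2si/cvttsd2si for scalar results, cvttps2dq/cvttpd2dq for packed i32
; results); of the run lines only AVX512DQ provides packed fp-to-i64
; conversions (vcvttps2qq/vcvttpd2qq).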
define <1 x i32> @constrained_vector_fptosi_v1i32_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v1i32_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v1i32_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax
; AVX-NEXT: retq
entry:
%result = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f32(
<1 x float><float 42.0>,
metadata !"fpexcept.strict") #0
ret <1 x i32> %result
}
define <2 x i32> @constrained_vector_fptosi_v2i32_v2f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttps2dq {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v2i32_v2f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttps2dq {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32(
<2 x float><float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <2 x i32> %result
}
define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax
; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <3 x i32> @llvm.experimental.constrained.fptosi.v3i32.v3f32(
<3 x float><float 42.0, float 43.0,
float 44.0>,
metadata !"fpexcept.strict") #0
ret <3 x i32> %result
}
define <4 x i32> @constrained_vector_fptosi_v4i32_v4f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttps2dq {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v4i32_v4f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttps2dq {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(
<4 x float><float 42.0, float 43.0,
float 44.0, float 45.0>,
metadata !"fpexcept.strict") #0
ret <4 x i32> %result
}
define <1 x i64> @constrained_vector_fptosi_v1i64_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v1i64_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v1i64_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX-NEXT: retq
entry:
%result = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f32(
<1 x float><float 42.0>,
metadata !"fpexcept.strict") #0
ret <1 x i64> %result
}
define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v2i64_v2f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(
<2 x float><float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <2 x i64> %result
}
define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rcx
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rdx
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x i64> @llvm.experimental.constrained.fptosi.v3i64.v3f32(
<3 x float><float 42.0, float 43.0,
float 44.0>,
metadata !"fpexcept.strict") #0
ret <3 x i64> %result
}
define <4 x i64> @constrained_vector_fptosi_v4i64_v4f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v4i64_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm2
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptosi_v4i64_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptosi_v4i64_v4f32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptosi_v4i64_v4f32:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(
<4 x float><float 42.0, float 43.0,
float 44.0, float 45.0>,
metadata !"fpexcept.strict") #0
ret <4 x i64> %result
}
define <1 x i32> @constrained_vector_fptosi_v1i32_v1f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v1i32_v1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v1i32_v1f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax
; AVX-NEXT: retq
entry:
%result = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f64(
<1 x double><double 42.1>,
metadata !"fpexcept.strict") #0
ret <1 x i32> %result
}
define <2 x i32> @constrained_vector_fptosi_v2i32_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v2i32_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dqx {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(
<2 x double><double 42.1, double 42.2>,
metadata !"fpexcept.strict") #0
ret <2 x i32> %result
}
define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax
; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <3 x i32> @llvm.experimental.constrained.fptosi.v3i32.v3f64(
<3 x double><double 42.1, double 42.2,
double 42.3>,
metadata !"fpexcept.strict") #0
ret <3 x i32> %result
}
define <4 x i32> @constrained_vector_fptosi_v4i32_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm1
; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v4i32_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttpd2dqy {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64(
<4 x double><double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"fpexcept.strict") #0
ret <4 x i32> %result
}
define <1 x i64> @constrained_vector_fptosi_v1i64_v1f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v1i64_v1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptosi_v1i64_v1f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX-NEXT: retq
entry:
%result = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f64(
<1 x double><double 42.1>,
metadata !"fpexcept.strict") #0
ret <1 x i64> %result
}
define <2 x i64> @constrained_vector_fptosi_v2i64_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptosi_v2i64_v2f64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1]
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(
<2 x double><double 42.1, double 42.2>,
metadata !"fpexcept.strict") #0
ret <2 x i64> %result
}
define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rcx
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rdx
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x i64> @llvm.experimental.constrained.fptosi.v3i64.v3f64(
<3 x double><double 42.1, double 42.2,
double 42.3>,
metadata !"fpexcept.strict") #0
ret <3 x i64> %result
}
define <4 x i64> @constrained_vector_fptosi_v4i64_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v4i64_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm2
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptosi_v4i64_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptosi_v4i64_v4f64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptosi_v4i64_v4f64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1]
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64(
<4 x double><double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"fpexcept.strict") #0
ret <4 x i64> %result
}
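; Note: the pre-AVX512 run lines have no unsigned conversion, so i32 results go
; through the 64-bit signed cvttss2si/cvttsd2si (or a packed compare/subtract/
; xor expansion) and i64 results through the branchy compare/subtract/
; xor-of-the-sign-bit sequence below; AVX512 uses the native unsigned
; conversions (vcvttss2usi/vcvttps2udq).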
define <1 x i32> @constrained_vector_fptoui_v1i32_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v1i32_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v1i32_v1f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v1i32_v1f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax
; AVX512-NEXT: retq
entry:
%result = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f32(
<1 x float><float 42.0>,
metadata !"fpexcept.strict") #0
ret <1 x i32> %result
}
define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v2i32_v2f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rcx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0]
; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f32(
<2 x float><float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <2 x i32> %result
}
define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v3i32_v3f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rcx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: vcvttss2si {{.*}}(%rip), %rax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax
; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %eax
; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%result = call <3 x i32> @llvm.experimental.constrained.fptoui.v3i32.v3f32(
<3 x float><float 42.0, float 43.0,
float 44.0>,
metadata !"fpexcept.strict") #0
ret <3 x i32> %result
}
define <4 x i32> @constrained_vector_fptoui_v4i32_v4f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v4i32_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; CHECK-NEXT: movaps %xmm1, %xmm2
; CHECK-NEXT: cmpltps %xmm0, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm3
; CHECK-NEXT: andnps {{.*}}(%rip), %xmm3
; CHECK-NEXT: andnps %xmm0, %xmm2
; CHECK-NEXT: subps %xmm2, %xmm1
; CHECK-NEXT: cvttps2dq %xmm1, %xmm0
; CHECK-NEXT: xorps %xmm3, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vxorps %xmm4, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v4i32_v4f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(
<4 x float><float 42.0, float 43.0,
float 44.0, float 45.0>,
metadata !"fpexcept.strict") #0
ret <4 x i32> %result
}
define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm0, %xmm2
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: ja .LBB115_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm2, %xmm1
; CHECK-NEXT: .LBB115_2: # %entry
; CHECK-NEXT: subss %xmm1, %xmm0
; CHECK-NEXT: cvttss2si %xmm0, %rcx
; CHECK-NEXT: setbe %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: shlq $63, %rax
; CHECK-NEXT: xorq %rcx, %rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm0, %xmm1
; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: ja .LBB115_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovaps %xmm1, %xmm2
; AVX1-NEXT: .LBB115_2: # %entry
; AVX1-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vcvttss2si %xmm0, %rcx
; AVX1-NEXT: setbe %al
; AVX1-NEXT: movzbl %al, %eax
; AVX1-NEXT: shlq $63, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: retq
entry:
%result = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f32(
<1 x float><float 42.0>,
metadata !"fpexcept.strict") #0
ret <1 x i64> %result
}
define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm2, %xmm1
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: xorps %xmm3, %xmm3
; CHECK-NEXT: ja .LBB116_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm3
; CHECK-NEXT: .LBB116_2: # %entry
; CHECK-NEXT: subss %xmm3, %xmm2
; CHECK-NEXT: cvttss2si %xmm2, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm2
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm3, %xmm1
; CHECK-NEXT: ja .LBB116_4
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: .LBB116_4: # %entry
; CHECK-NEXT: subss %xmm0, %xmm3
; CHECK-NEXT: cvttss2si %xmm3, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm2, %xmm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB116_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm3
; AVX1-NEXT: .LBB116_2: # %entry
; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm3, %xmm0
; AVX1-NEXT: ja .LBB116_4
; AVX1-NEXT: # %bb.3: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: .LBB116_4: # %entry
; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm0
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(
<2 x float><float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <2 x i64> %result
}
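; The scalar SSE/AVX1 sequences above appear to encode the generic
; FP_TO_UINT-via-FP_TO_SINT expansion: the source is compared against 2^63
; (comiss), a float offset of either 0 or 2^63 is selected and subtracted,
; the difference is converted with the signed cvttss2si, and the
; setbe/shlq $63/xorq tail xors the 2^63 integer offset back into the result.
; AVX512 targets convert directly with vcvttss2usi instead.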
define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm2, %xmm1
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: xorps %xmm3, %xmm3
; CHECK-NEXT: ja .LBB117_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm3
; CHECK-NEXT: .LBB117_2: # %entry
; CHECK-NEXT: subss %xmm3, %xmm2
; CHECK-NEXT: cvttss2si %xmm2, %rcx
; CHECK-NEXT: setbe %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: shlq $63, %rax
; CHECK-NEXT: xorq %rcx, %rax
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm2, %xmm1
; CHECK-NEXT: xorps %xmm3, %xmm3
; CHECK-NEXT: ja .LBB117_4
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm3
; CHECK-NEXT: .LBB117_4: # %entry
; CHECK-NEXT: subss %xmm3, %xmm2
; CHECK-NEXT: cvttss2si %xmm2, %rcx
; CHECK-NEXT: setbe %dl
; CHECK-NEXT: movzbl %dl, %edx
; CHECK-NEXT: shlq $63, %rdx
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm2, %xmm1
; CHECK-NEXT: ja .LBB117_6
; CHECK-NEXT: # %bb.5: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: .LBB117_6: # %entry
; CHECK-NEXT: subss %xmm0, %xmm2
; CHECK-NEXT: cvttss2si %xmm2, %rsi
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rsi, %rcx
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm2, %xmm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB117_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm3
; AVX1-NEXT: .LBB117_2: # %entry
; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttss2si %xmm2, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm3, %xmm0
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: ja .LBB117_4
; AVX1-NEXT: # %bb.3: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm4
; AVX1-NEXT: .LBB117_4: # %entry
; AVX1-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm3, %xmm0
; AVX1-NEXT: ja .LBB117_6
; AVX1-NEXT: # %bb.5: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: .LBB117_6: # %entry
; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm0
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f32(
<3 x float><float 42.0, float 43.0,
float 44.0>,
metadata !"fpexcept.strict") #0
ret <3 x i64> %result
}
define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm0, %xmm2
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: xorps %xmm3, %xmm3
; CHECK-NEXT: ja .LBB118_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm2, %xmm3
; CHECK-NEXT: .LBB118_2: # %entry
; CHECK-NEXT: subss %xmm3, %xmm0
; CHECK-NEXT: cvttss2si %xmm0, %rcx
; CHECK-NEXT: setbe %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: shlq $63, %rax
; CHECK-NEXT: xorq %rcx, %rax
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm0, %xmm2
; CHECK-NEXT: xorps %xmm4, %xmm4
; CHECK-NEXT: ja .LBB118_4
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: movaps %xmm2, %xmm4
; CHECK-NEXT: .LBB118_4: # %entry
; CHECK-NEXT: movq %rax, %xmm3
; CHECK-NEXT: subss %xmm4, %xmm0
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm0
; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm4, %xmm2
; CHECK-NEXT: xorps %xmm5, %xmm5
; CHECK-NEXT: ja .LBB118_6
; CHECK-NEXT: # %bb.5: # %entry
; CHECK-NEXT: movaps %xmm2, %xmm5
; CHECK-NEXT: .LBB118_6: # %entry
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; CHECK-NEXT: subss %xmm5, %xmm4
; CHECK-NEXT: cvttss2si %xmm4, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm3
; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; CHECK-NEXT: comiss %xmm4, %xmm2
; CHECK-NEXT: ja .LBB118_8
; CHECK-NEXT: # %bb.7: # %entry
; CHECK-NEXT: movaps %xmm2, %xmm1
; CHECK-NEXT: .LBB118_8: # %entry
; CHECK-NEXT: subss %xmm1, %xmm4
; CHECK-NEXT: cvttss2si %xmm4, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm2, %xmm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB118_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm3
; AVX1-NEXT: .LBB118_2: # %entry
; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttss2si %xmm2, %rcx
; AVX1-NEXT: setbe %al
; AVX1-NEXT: movzbl %al, %eax
; AVX1-NEXT: shlq $63, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm3, %xmm0
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: ja .LBB118_4
; AVX1-NEXT: # %bb.3: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm4
; AVX1-NEXT: .LBB118_4: # %entry
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm3
; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm4, %xmm0
; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX1-NEXT: ja .LBB118_6
; AVX1-NEXT: # %bb.5: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm5
; AVX1-NEXT: .LBB118_6: # %entry
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vsubss %xmm5, %xmm4, %xmm3
; AVX1-NEXT: vcvttss2si %xmm3, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm3
; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vcomiss %xmm4, %xmm0
; AVX1-NEXT: ja .LBB118_8
; AVX1-NEXT: # %bb.7: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: .LBB118_8: # %entry
; AVX1-NEXT: vsubss %xmm1, %xmm4, %xmm0
; AVX1-NEXT: vcvttss2si %xmm0, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptoui_v4i64_v4f32:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1]
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(
<4 x float><float 42.0, float 43.0,
float 44.0, float 45.0>,
metadata !"fpexcept.strict") #0
ret <4 x i64> %result
}
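; With AVX512DQ the v4i64 case above is handled by widening the operand and
; issuing a single vcvttps2uqq on the zmm register, whereas plain AVX512F
; falls back to four scalar vcvttss2usi conversions that are then repacked.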
define <1 x i32> @constrained_vector_fptoui_v1i32_v1f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v1i32_v1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v1i32_v1f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v1i32_v1f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax
; AVX512-NEXT: retq
entry:
%result = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f64(
<1 x double><double 42.1>,
metadata !"fpexcept.strict") #0
ret <1 x i32> %result
}
define <2 x i32> @constrained_vector_fptoui_v2i32_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v2i32_v2f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rcx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,0.0E+0,0.0E+0]
; AVX512-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(
<2 x double><double 42.1, double 42.2>,
metadata !"fpexcept.strict") #0
ret <2 x i32> %result
}
define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v3i32_v3f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rcx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax
; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %eax
; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%result = call <3 x i32> @llvm.experimental.constrained.fptoui.v3i32.v3f64(
<3 x double><double 42.1, double 42.2,
double 42.3>,
metadata !"fpexcept.strict") #0
ret <3 x i32> %result
}
define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v4i32_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm2
; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1]
; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2]
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3
; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vsubpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v4i32_v4f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1]
; AVX512-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f64(
<4 x double><double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"fpexcept.strict") #0
ret <4 x i32> %result
}
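; For the packed double-to-i32 case the AVX1 lowering above uses a fully
; branchless form of the same expansion: vcmpltpd builds the per-lane
; selection mask, vblendvps/vblendvpd pick the 2^31 integer and float
; offsets, vsubpd removes the float offset, vcvttpd2dq performs the signed
; conversion, and vxorpd reapplies the integer offset. AVX512 lowers the
; whole vector with one vcvttpd2udq.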
define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: comisd %xmm0, %xmm2
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: ja .LBB123_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movapd %xmm2, %xmm1
; CHECK-NEXT: .LBB123_2: # %entry
; CHECK-NEXT: subsd %xmm1, %xmm0
; CHECK-NEXT: cvttsd2si %xmm0, %rcx
; CHECK-NEXT: setbe %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: shlq $63, %rax
; CHECK-NEXT: xorq %rcx, %rax
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm0, %xmm1
; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: ja .LBB123_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovapd %xmm1, %xmm2
; AVX1-NEXT: .LBB123_2: # %entry
; AVX1-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vcvttsd2si %xmm0, %rcx
; AVX1-NEXT: setbe %al
; AVX1-NEXT: movzbl %al, %eax
; AVX1-NEXT: shlq $63, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v1i64_v1f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512-NEXT: retq
entry:
%result = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f64(
<1 x double><double 42.1>,
metadata !"fpexcept.strict") #0
ret <1 x i64> %result
}
define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: comisd %xmm2, %xmm1
; CHECK-NEXT: xorpd %xmm0, %xmm0
; CHECK-NEXT: xorpd %xmm3, %xmm3
; CHECK-NEXT: ja .LBB124_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movapd %xmm1, %xmm3
; CHECK-NEXT: .LBB124_2: # %entry
; CHECK-NEXT: subsd %xmm3, %xmm2
; CHECK-NEXT: cvttsd2si %xmm2, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm2
; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: comisd %xmm3, %xmm1
; CHECK-NEXT: ja .LBB124_4
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: .LBB124_4: # %entry
; CHECK-NEXT: subsd %xmm0, %xmm3
; CHECK-NEXT: cvttsd2si %xmm3, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm2, %xmm0
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB124_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm3
; AVX1-NEXT: .LBB124_2: # %entry
; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttsd2si %xmm2, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm3, %xmm0
; AVX1-NEXT: ja .LBB124_4
; AVX1-NEXT: # %bb.3: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm1
; AVX1-NEXT: .LBB124_4: # %entry
; AVX1-NEXT: vsubsd %xmm1, %xmm3, %xmm0
; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptoui_v2i64_v2f64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1]
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(
<2 x double><double 42.1, double 42.2>,
metadata !"fpexcept.strict") #0
ret <2 x i64> %result
}

define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: comisd %xmm2, %xmm1
; CHECK-NEXT: xorpd %xmm0, %xmm0
; CHECK-NEXT: xorpd %xmm3, %xmm3
; CHECK-NEXT: ja .LBB125_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movapd %xmm1, %xmm3
; CHECK-NEXT: .LBB125_2: # %entry
; CHECK-NEXT: subsd %xmm3, %xmm2
; CHECK-NEXT: cvttsd2si %xmm2, %rcx
; CHECK-NEXT: setbe %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: shlq $63, %rax
; CHECK-NEXT: xorq %rcx, %rax
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: comisd %xmm2, %xmm1
; CHECK-NEXT: xorpd %xmm3, %xmm3
; CHECK-NEXT: ja .LBB125_4
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: movapd %xmm1, %xmm3
; CHECK-NEXT: .LBB125_4: # %entry
; CHECK-NEXT: subsd %xmm3, %xmm2
; CHECK-NEXT: cvttsd2si %xmm2, %rcx
; CHECK-NEXT: setbe %dl
; CHECK-NEXT: movzbl %dl, %edx
; CHECK-NEXT: shlq $63, %rdx
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: comisd %xmm2, %xmm1
; CHECK-NEXT: ja .LBB125_6
; CHECK-NEXT: # %bb.5: # %entry
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: .LBB125_6: # %entry
; CHECK-NEXT: subsd %xmm0, %xmm2
; CHECK-NEXT: cvttsd2si %xmm2, %rsi
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rsi, %rcx
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm2, %xmm0
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB125_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm3
; AVX1-NEXT: .LBB125_2: # %entry
; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttsd2si %xmm2, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm3, %xmm0
; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: ja .LBB125_4
; AVX1-NEXT: # %bb.3: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm4
; AVX1-NEXT: .LBB125_4: # %entry
; AVX1-NEXT: vsubsd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttsd2si %xmm3, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm3, %xmm0
; AVX1-NEXT: ja .LBB125_6
; AVX1-NEXT: # %bb.5: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm1
; AVX1-NEXT: .LBB125_6: # %entry
; AVX1-NEXT: vsubsd %xmm1, %xmm3, %xmm0
; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm0
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f64(
<3 x double><double 42.1, double 42.2,
double 42.3>,
metadata !"fpexcept.strict") #0
ret <3 x i64> %result
}

define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: comisd %xmm0, %xmm2
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: xorpd %xmm3, %xmm3
; CHECK-NEXT: ja .LBB126_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movapd %xmm2, %xmm3
; CHECK-NEXT: .LBB126_2: # %entry
; CHECK-NEXT: subsd %xmm3, %xmm0
; CHECK-NEXT: cvttsd2si %xmm0, %rcx
; CHECK-NEXT: setbe %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: shlq $63, %rax
; CHECK-NEXT: xorq %rcx, %rax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: comisd %xmm0, %xmm2
; CHECK-NEXT: xorpd %xmm4, %xmm4
; CHECK-NEXT: ja .LBB126_4
; CHECK-NEXT: # %bb.3: # %entry
; CHECK-NEXT: movapd %xmm2, %xmm4
; CHECK-NEXT: .LBB126_4: # %entry
; CHECK-NEXT: movq %rax, %xmm3
; CHECK-NEXT: subsd %xmm4, %xmm0
; CHECK-NEXT: cvttsd2si %xmm0, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm0
; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; CHECK-NEXT: comisd %xmm4, %xmm2
; CHECK-NEXT: xorpd %xmm5, %xmm5
; CHECK-NEXT: ja .LBB126_6
; CHECK-NEXT: # %bb.5: # %entry
; CHECK-NEXT: movapd %xmm2, %xmm5
; CHECK-NEXT: .LBB126_6: # %entry
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; CHECK-NEXT: subsd %xmm5, %xmm4
; CHECK-NEXT: cvttsd2si %xmm4, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm3
; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero
; CHECK-NEXT: comisd %xmm4, %xmm2
; CHECK-NEXT: ja .LBB126_8
; CHECK-NEXT: # %bb.7: # %entry
; CHECK-NEXT: movapd %xmm2, %xmm1
; CHECK-NEXT: .LBB126_8: # %entry
; CHECK-NEXT: subsd %xmm1, %xmm4
; CHECK-NEXT: cvttsd2si %xmm4, %rax
; CHECK-NEXT: setbe %cl
; CHECK-NEXT: movzbl %cl, %ecx
; CHECK-NEXT: shlq $63, %rcx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm2, %xmm0
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB126_2
; AVX1-NEXT: # %bb.1: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm3
; AVX1-NEXT: .LBB126_2: # %entry
; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttsd2si %xmm2, %rcx
; AVX1-NEXT: setbe %al
; AVX1-NEXT: movzbl %al, %eax
; AVX1-NEXT: shlq $63, %rax
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm3, %xmm0
; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: ja .LBB126_4
; AVX1-NEXT: # %bb.3: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm4
; AVX1-NEXT: .LBB126_4: # %entry
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vsubsd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vcvttsd2si %xmm3, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm3
; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm4, %xmm0
; AVX1-NEXT: vxorpd %xmm5, %xmm5, %xmm5
; AVX1-NEXT: ja .LBB126_6
; AVX1-NEXT: # %bb.5: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm5
; AVX1-NEXT: .LBB126_6: # %entry
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vsubsd %xmm5, %xmm4, %xmm3
; AVX1-NEXT: vcvttsd2si %xmm3, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm3
; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT: vcomisd %xmm4, %xmm0
; AVX1-NEXT: ja .LBB126_8
; AVX1-NEXT: # %bb.7: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm1
; AVX1-NEXT: .LBB126_8: # %entry
; AVX1-NEXT: vsubsd %xmm1, %xmm4, %xmm0
; AVX1-NEXT: vcvttsd2si %xmm0, %rax
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
; AVX1-NEXT: xorq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_fptoui_v4i64_v4f64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1]
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64(
<4 x double><double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"fpexcept.strict") #0
ret <4 x i64> %result
}

define <1 x float> @constrained_vector_fptrunc_v1f64() #0 {
; CHECK-LABEL: constrained_vector_fptrunc_v1f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptrunc_v1f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <1 x float> @llvm.experimental.constrained.fptrunc.v1f32.v1f64(
<1 x double><double 42.1>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %result
}

define <2 x float> @constrained_vector_fptrunc_v2f64() #0 {
; CHECK-LABEL: constrained_vector_fptrunc_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptrunc_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtpd2psx {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(
<2 x double><double 42.1, double 42.2>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x float> %result
}

define <3 x float> @constrained_vector_fptrunc_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptrunc_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: cvtsd2ss %xmm1, %xmm1
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptrunc_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: retq
entry:
%result = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64(
<3 x double><double 42.1, double 42.2,
double 42.3>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %result
}

define <4 x float> @constrained_vector_fptrunc_v4f64() #0 {
; CHECK-LABEL: constrained_vector_fptrunc_v4f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtpd2ps {{.*}}(%rip), %xmm1
; CHECK-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptrunc_v4f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtpd2psy {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(
<4 x double><double 42.1, double 42.2,
double 42.3, double 42.4>,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}

define <1 x double> @constrained_vector_fpext_v1f32() #0 {
; CHECK-LABEL: constrained_vector_fpext_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fpext_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32(
<1 x float><float 42.0>,
metadata !"fpexcept.strict") #0
ret <1 x double> %result
}

define <2 x double> @constrained_vector_fpext_v2f32() #0 {
; CHECK-LABEL: constrained_vector_fpext_v2f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fpext_v2f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtps2pd {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(
<2 x float><float 42.0, float 43.0>,
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}

define <3 x double> @constrained_vector_fpext_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fpext_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: cvtss2sd %xmm2, %xmm2
; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fpext_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
%result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(
<3 x float><float 42.0, float 43.0,
float 44.0>,
metadata !"fpexcept.strict") #0
ret <3 x double> %result
}

define <4 x double> @constrained_vector_fpext_v4f32() #0 {
; CHECK-LABEL: constrained_vector_fpext_v4f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm1
; CHECK-NEXT: cvtps2pd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fpext_v4f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtps2pd {{.*}}(%rip), %ymm0
; AVX-NEXT: retq
entry:
%result = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32(
<4 x float><float 42.0, float 43.0,
float 44.0, float 45.0>,
metadata !"fpexcept.strict") #0
ret <4 x double> %result
}

define <1 x float> @constrained_vector_ceil_v1f32() #0 {
; CHECK-LABEL: constrained_vector_ceil_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq ceilf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_ceil_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32(
<1 x float> <float 1.5>,
metadata !"fpexcept.strict") #0
ret <1 x float> %ceil
}

define <2 x double> @constrained_vector_ceil_v2f64() #0 {
; CHECK-LABEL: constrained_vector_ceil_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq ceil
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq ceil
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_ceil_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $10, {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(
<2 x double> <double 1.1, double 1.9>,
metadata !"fpexcept.strict") #0
ret <2 x double> %ceil
}

define <3 x float> @constrained_vector_ceil_v3f32() #0 {
; CHECK-LABEL: constrained_vector_ceil_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq ceilf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq ceilf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq ceilf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_ceil_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $10, %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $10, %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32(
<3 x float> <float 1.5, float 2.5, float 3.5>,
metadata !"fpexcept.strict") #0
ret <3 x float> %ceil
}

define <3 x double> @constrained_vector_ceil_v3f64() #0 {
; CHECK-LABEL: constrained_vector_ceil_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq ceil
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq ceil
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq ceil
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_ceil_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vroundpd $10, {{.*}}(%rip), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64(
<3 x double> <double 1.1, double 1.9, double 1.5>,
metadata !"fpexcept.strict") #0
ret <3 x double> %ceil
}

define <1 x float> @constrained_vector_floor_v1f32() #0 {
; CHECK-LABEL: constrained_vector_floor_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq floorf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_floor_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32(
<1 x float> <float 1.5>,
metadata !"fpexcept.strict") #0
ret <1 x float> %floor
}

define <2 x double> @constrained_vector_floor_v2f64() #0 {
; CHECK-LABEL: constrained_vector_floor_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq floor
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq floor
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_floor_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $9, {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64(
<2 x double> <double 1.1, double 1.9>,
metadata !"fpexcept.strict") #0
ret <2 x double> %floor
}

define <3 x float> @constrained_vector_floor_v3f32() #0 {
; CHECK-LABEL: constrained_vector_floor_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq floorf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq floorf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq floorf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_floor_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $9, %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $9, %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32(
<3 x float> <float 1.5, float 2.5, float 3.5>,
metadata !"fpexcept.strict") #0
ret <3 x float> %floor
}

define <3 x double> @constrained_vector_floor_v3f64() #0 {
; CHECK-LABEL: constrained_vector_floor_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq floor
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq floor
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq floor
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_floor_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vroundpd $9, {{.*}}(%rip), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64(
<3 x double> <double 1.1, double 1.9, double 1.5>,
metadata !"fpexcept.strict") #0
ret <3 x double> %floor
}

define <1 x float> @constrained_vector_round_v1f32() #0 {
; CHECK-LABEL: constrained_vector_round_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq roundf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_round_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq roundf
; AVX-NEXT: popq %rax
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%round = call <1 x float> @llvm.experimental.constrained.round.v1f32(
<1 x float> <float 1.5>,
metadata !"fpexcept.strict") #0
ret <1 x float> %round
}

define <2 x double> @constrained_vector_round_v2f64() #0 {
; CHECK-LABEL: constrained_vector_round_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq round
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq round
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_round_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq round
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq round
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%round = call <2 x double> @llvm.experimental.constrained.round.v2f64(
<2 x double> <double 1.1, double 1.9>,
metadata !"fpexcept.strict") #0
ret <2 x double> %round
}

define <3 x float> @constrained_vector_round_v3f32() #0 {
; CHECK-LABEL: constrained_vector_round_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq roundf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq roundf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq roundf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_round_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 48
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq roundf
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq roundf
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: callq roundf
; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%round = call <3 x float> @llvm.experimental.constrained.round.v3f32(
<3 x float> <float 1.5, float 2.5, float 3.5>,
metadata !"fpexcept.strict") #0
ret <3 x float> %round
}

define <3 x double> @constrained_vector_round_v3f64() #0 {
; CHECK-LABEL: constrained_vector_round_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq round
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq round
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq round
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_round_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 64
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq round
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq round
; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq round
; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: addq $56, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%round = call <3 x double> @llvm.experimental.constrained.round.v3f64(
<3 x double> <double 1.1, double 1.9, double 1.5>,
metadata !"fpexcept.strict") #0
ret <3 x double> %round
}

define <1 x float> @constrained_vector_trunc_v1f32() #0 {
; CHECK-LABEL: constrained_vector_trunc_v1f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq truncf
; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_trunc_v1f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%trunc = call <1 x float> @llvm.experimental.constrained.trunc.v1f32(
<1 x float> <float 1.5>,
metadata !"fpexcept.strict") #0
ret <1 x float> %trunc
}

define <2 x double> @constrained_vector_trunc_v2f64() #0 {
; CHECK-LABEL: constrained_vector_trunc_v2f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq trunc
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq trunc
; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_trunc_v2f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vroundpd $11, {{.*}}(%rip), %xmm0
; AVX-NEXT: retq
entry:
%trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64(
<2 x double> <double 1.1, double 1.9>,
metadata !"fpexcept.strict") #0
ret <2 x double> %trunc
}

define <3 x float> @constrained_vector_trunc_v3f32() #0 {
; CHECK-LABEL: constrained_vector_trunc_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq truncf
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq truncf
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq truncf
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $40, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_trunc_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $11, %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vroundss $11, %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%trunc = call <3 x float> @llvm.experimental.constrained.trunc.v3f32(
<3 x float> <float 1.5, float 2.5, float 3.5>,
metadata !"fpexcept.strict") #0
ret <3 x float> %trunc
}

define <3 x double> @constrained_vector_trunc_v3f64() #0 {
; CHECK-LABEL: constrained_vector_trunc_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq trunc
; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq trunc
; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: callq trunc
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero
; CHECK-NEXT: addq $24, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_trunc_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; AVX-NEXT: vroundpd $11, {{.*}}(%rip), %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64(
<3 x double> <double 1.1, double 1.9, double 1.5>,
metadata !"fpexcept.strict") #0
ret <3 x double> %trunc
}

define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtsi2sd %edi, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <1 x double>
@llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x double> %result
}

define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtsi2ss %edi, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <1 x float>
@llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %result
}

define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v1f64_v1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtsi2sd %rdi, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <1 x double>
@llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x double> %result
}

define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v1f32_v1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtsi2ss %rdi, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <1 x float>
@llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %result
}

define <2 x double> @constrained_vector_sitofp_v2f64_v2i32(<2 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v2f64_v2i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <2 x double>
@llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}

define <2 x float> @constrained_vector_sitofp_v2f32_v2i32(<2 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <2 x float>
@llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x float> %result
}

define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v2f64_v2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_sitofp_v2f64_v2i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_sitofp_v2f64_v2i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_sitofp_v2f64_v2i64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <2 x double>
@llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}
define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v2f32_v2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: retq
entry:
%result = call <2 x float>
@llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x float> %result
}
define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: cvtsi2sd %eax, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2sd %eax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %eax, %xmm0
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm2
; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX-NEXT: vpextrd $2, %xmm0, %eax
; AVX-NEXT: vcvtsi2sd %eax, %xmm3, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%result = call <3 x double>
@llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %result
}
define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: cvtsi2ss %eax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm2, %eax
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2ss %eax, %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %eax, %xmm0
; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2
; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX-NEXT: vpextrd $2, %xmm0, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%result = call <3 x float>
@llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %result
}
define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v3f64_v3i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtsi2sd %rsi, %xmm1
; CHECK-NEXT: cvtsi2sd %rdi, %xmm0
; CHECK-NEXT: cvtsi2sd %rdx, %xmm2
; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x double>
@llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %result
}
define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtsi2ss %rsi, %xmm1
; CHECK-NEXT: cvtsi2ss %rdi, %xmm0
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rdx, %xmm1
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <3 x float>
@llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %result
}
define <4 x double> @constrained_vector_sitofp_v4f64_v4i32(<4 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v4f64_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
entry:
%result = call <4 x double>
@llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %result
}
define <4 x float> @constrained_vector_sitofp_v4f32_v4i32(<4 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v4f32_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
entry:
%result = call <4 x float>
@llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v4f64_v4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: cvtsi2sd %rax, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: movapd %xmm3, %xmm1
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_sitofp_v4f64_v4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_sitofp_v4f64_v4i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_sitofp_v4f64_v4i64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x double>
@llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %result
}
define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v4f32_v4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: cvtsi2ss %rax, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_sitofp_v4f32_v4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_sitofp_v4f32_v4i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_sitofp_v4f32_v4i64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x float>
@llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
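; The remaining tests cover strict (constrained) unsigned integer to
; floating-point conversions. AVX-512 can use the unsigned vcvtusi2ss/sd and
; vcvtudq2ps/pd (or vcvtuqq2ps/pd with DQ) instructions, while the SSE and
; AVX1 checks show the expansions: magic-constant bias subtraction for the
; i32 cases and a shift-plus-compensating-add sequence for the i64 cases.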
define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%result = call <1 x double>
@llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x double> %result
}
define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%result = call <1 x float>
@llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %result
}
define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm1
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: addpd %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%result = call <1 x double>
@llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x double> %result
}
define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v1f32_v1i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rcx
; CHECK-NEXT: cvtsi2ss %rcx, %xmm0
; CHECK-NEXT: jns .LBB170_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addss %xmm0, %xmm0
; CHECK-NEXT: .LBB170_2: # %entry
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movq %rdi, %rax
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: movl %edi, %ecx
; AVX1-NEXT: andl $1, %ecx
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: testq %rdi, %rdi
; AVX1-NEXT: cmovnsq %rdi, %rcx
; AVX1-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0
; AVX1-NEXT: jns .LBB170_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB170_2: # %entry
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
%result = call <1 x float>
@llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <1 x float> %result
}
define <2 x double> @constrained_vector_uitofp_v2f64_v2i32(<2 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
; CHECK-NEXT: orpd %xmm1, %xmm0
; CHECK-NEXT: subpd %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v2f64_v2i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <2 x double>
@llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}
define <2 x float> @constrained_vector_uitofp_v2f32_v2i32(<2 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
; CHECK-NEXT: orpd %xmm1, %xmm0
; CHECK-NEXT: subpd %xmm1, %xmm0
; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <2 x float>
@llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x float> %result
}
define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; CHECK-NEXT: pand %xmm0, %xmm1
; CHECK-NEXT: por {{.*}}(%rip), %xmm1
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: subpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: addpd %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v2f64_v2i64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <2 x double>
@llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}
define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v2f32_v2i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rdx, %xmm0
; CHECK-NEXT: jns .LBB174_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addss %xmm0, %xmm0
; CHECK-NEXT: .LBB174_2: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rdx, %xmm1
; CHECK-NEXT: jns .LBB174_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addss %xmm1, %xmm1
; CHECK-NEXT: .LBB174_4: # %entry
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512-NEXT: retq
entry:
%result = call <2 x float>
@llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x float> %result
}
define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: cvtsi2sd %rax, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm1, %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2sd %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2sd %rax, %xmm0
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: movapd %xmm2, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractps $1, %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vextractps $1, %xmm0, %eax
; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0
; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x double>
@llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %result
}
define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm2, %eax
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2ss %rax, %xmm2
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractps $1, %xmm0, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vextractps $1, %xmm0, %eax
; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX512-NEXT: retq
entry:
%result = call <3 x float>
@llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %result
}
define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
; CHECK-NEXT: subpd %xmm3, %xmm1
; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: addpd %xmm1, %xmm0
; CHECK-NEXT: movq %rsi, %xmm4
; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; CHECK-NEXT: subpd %xmm3, %xmm4
; CHECK-NEXT: movapd %xmm4, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; CHECK-NEXT: addpd %xmm4, %xmm1
; CHECK-NEXT: movq %rdx, %xmm4
; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; CHECK-NEXT: subpd %xmm3, %xmm4
; CHECK-NEXT: movapd %xmm4, %xmm2
; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
; CHECK-NEXT: addpd %xmm4, %xmm2
; CHECK-NEXT: movlpd %xmm2, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp)
; CHECK-NEXT: wait
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4.503599627370496E+15,1.9342813113834067E+25]
; AVX1-NEXT: vsubpd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX1-NEXT: vaddpd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,0,1]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX1-NEXT: vsubpd %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpermilpd {{.*#+}} xmm5 = xmm4[1,0]
; AVX1-NEXT: vaddpd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0
; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x double>
@llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x double> %result
}
define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rsi, %rsi
; CHECK-NEXT: cmovnsq %rsi, %rcx
; CHECK-NEXT: cvtsi2ss %rcx, %xmm1
; CHECK-NEXT: jns .LBB178_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addss %xmm1, %xmm1
; CHECK-NEXT: .LBB178_2: # %entry
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: cmovnsq %rdi, %rcx
; CHECK-NEXT: cvtsi2ss %rcx, %xmm0
; CHECK-NEXT: jns .LBB178_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addss %xmm0, %xmm0
; CHECK-NEXT: .LBB178_4: # %entry
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq %rax
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: cmovnsq %rdx, %rcx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rcx, %xmm1
; CHECK-NEXT: jns .LBB178_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: addss %xmm1, %xmm1
; CHECK-NEXT: .LBB178_6: # %entry
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1
; AVX1-NEXT: jns .LBB178_2
; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB178_2: # %entry
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2
; AVX1-NEXT: jns .LBB178_4
; AVX1-NEXT: # %bb.3:
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB178_4: # %entry
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $1, %edx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: cmovnsq %rax, %rdx
; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0
; AVX1-NEXT: jns .LBB178_6
; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB178_6: # %entry
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <3 x float>
@llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <3 x float> %result
}
define <4 x double> @constrained_vector_uitofp_v4f64_v4i32(<4 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorpd %xmm2, %xmm2
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15]
; CHECK-NEXT: orpd %xmm3, %xmm1
; CHECK-NEXT: subpd %xmm3, %xmm1
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: orpd %xmm3, %xmm0
; CHECK-NEXT: subpd %xmm3, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15]
; AVX1-NEXT: vorpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v4f64_v4i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovaps %xmm0, %xmm0
; AVX512-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
entry:
%result = call <4 x double>
@llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %result
}
define <4 x float> @constrained_vector_uitofp_v4f32_v4i32(<4 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; CHECK-NEXT: pand %xmm0, %xmm1
; CHECK-NEXT: por {{.*}}(%rip), %xmm1
; CHECK-NEXT: psrld $16, %xmm0
; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: subps {{.*}}(%rip), %xmm0
; CHECK-NEXT: addps %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v4f32_v4i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovaps %xmm0, %xmm0
; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%result = call <4 x float>
@llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; CHECK-NEXT: movdqa %xmm1, %xmm3
; CHECK-NEXT: pand %xmm2, %xmm3
; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
; CHECK-NEXT: por %xmm4, %xmm3
; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
; CHECK-NEXT: por %xmm5, %xmm1
; CHECK-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
; CHECK-NEXT: subpd %xmm6, %xmm1
; CHECK-NEXT: addpd %xmm3, %xmm1
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: por %xmm4, %xmm2
; CHECK-NEXT: psrlq $32, %xmm0
; CHECK-NEXT: por %xmm5, %xmm0
; CHECK-NEXT: subpd %xmm6, %xmm0
; CHECK-NEXT: addpd %xmm2, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v4f64_v4i64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x double>
@llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x double> %result
}
define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v4f32_v4i64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: cvtsi2ss %rdx, %xmm2
; CHECK-NEXT: jns .LBB182_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addss %xmm2, %xmm2
; CHECK-NEXT: .LBB182_2: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; CHECK-NEXT: movq %xmm1, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: cvtsi2ss %rdx, %xmm3
; CHECK-NEXT: jns .LBB182_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addss %xmm3, %xmm3
; CHECK-NEXT: .LBB182_4: # %entry
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rdx, %xmm1
; CHECK-NEXT: jns .LBB182_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: addss %xmm1, %xmm1
; CHECK-NEXT: .LBB182_6: # %entry
; CHECK-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shrq %rcx
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andl $1, %edx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: testq %rax, %rax
; CHECK-NEXT: cmovnsq %rax, %rdx
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rdx, %xmm0
; CHECK-NEXT: jns .LBB182_8
; CHECK-NEXT: # %bb.7:
; CHECK-NEXT: addss %xmm0, %xmm0
; CHECK-NEXT: .LBB182_8: # %entry
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm3
; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512F-LABEL: constrained_vector_uitofp_v4f32_v4i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: constrained_vector_uitofp_v4f32_v4i64:
; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vmovaps %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
entry:
%result = call <4 x float>
@llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
; Simple test to make sure we don't fuse vselect+strict_fadd into a masked operation.
define <16 x float> @vpaddd_mask_test(<16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone strictfp {
; CHECK-LABEL: vpaddd_mask_test:
; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm10, %xmm10
; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; CHECK-NEXT: pcmpeqd %xmm10, %xmm8
; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; CHECK-NEXT: pcmpeqd %xmm10, %xmm9
; CHECK-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
; CHECK-NEXT: pcmpeqd %xmm10, %xmm11
; CHECK-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm10
; CHECK-NEXT: addps %xmm3, %xmm7
; CHECK-NEXT: addps %xmm2, %xmm6
; CHECK-NEXT: addps %xmm1, %xmm5
; CHECK-NEXT: addps %xmm0, %xmm4
; CHECK-NEXT: andps %xmm10, %xmm0
; CHECK-NEXT: andnps %xmm4, %xmm10
; CHECK-NEXT: orps %xmm10, %xmm0
; CHECK-NEXT: andps %xmm11, %xmm1
; CHECK-NEXT: andnps %xmm5, %xmm11
; CHECK-NEXT: orps %xmm11, %xmm1
; CHECK-NEXT: andps %xmm9, %xmm2
; CHECK-NEXT: andnps %xmm6, %xmm9
; CHECK-NEXT: orps %xmm9, %xmm2
; CHECK-NEXT: andps %xmm8, %xmm3
; CHECK-NEXT: andnps %xmm7, %xmm8
; CHECK-NEXT: orps %xmm8, %xmm3
; CHECK-NEXT: retq
;
; AVX1-LABEL: vpaddd_mask_test:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm3
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm4, %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vblendvps %ymm5, %ymm1, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: vpaddd_mask_test:
; AVX512: # %bb.0:
; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm1
; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%x = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %i, <16 x float> %j, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
%r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %i
ret <16 x float> %r
}
declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
attributes #0 = { strictfp }
; Single width declarations
declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.log10.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.log2.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double>, <2 x double>, metadata)
declare <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double>, <2 x double>, metadata)
declare <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32(<2 x float>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float>, metadata)
declare <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double>, metadata)
declare <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f32(<2 x float>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float>, metadata)
declare <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double>, metadata)
declare <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float>, metadata)
declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, metadata)
declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata)
declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata)
declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata)
declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata)
declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata)
declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i32(<2 x i32>, metadata, metadata)
declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i32(<2 x i32>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.uitofp.v2f64.v2i64(<2 x i64>, metadata, metadata)
declare <2 x float> @llvm.experimental.constrained.uitofp.v2f32.v2i64(<2 x i64>, metadata, metadata)
; Scalar width declarations
declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.fsub.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.fmul.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.fdiv.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.frem.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.sqrt.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.log10.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.log2.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.rint.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(<1 x float>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.maxnum.v1f32(<1 x float>, <1 x float>, metadata)
declare <1 x float> @llvm.experimental.constrained.minnum.v1f32(<1 x float>, <1 x float>, metadata)
declare <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f32(<1 x float>, metadata)
declare <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f32(<1 x float>, metadata)
declare <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f64(<1 x double>, metadata)
declare <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f64(<1 x double>, metadata)
declare <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f32(<1 x float>, metadata)
declare <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f32(<1 x float>, metadata)
declare <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f64(<1 x double>, metadata)
declare <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f64(<1 x double>, metadata)
declare <1 x float> @llvm.experimental.constrained.fptrunc.v1f32.v1f64(<1 x double>, metadata, metadata)
declare <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32(<1 x float>, metadata)
declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metadata)
declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata)
declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata)
declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata)
declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i32(<1 x i32>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i32(<1 x i32>, metadata, metadata)
declare <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.sitofp.v1f32.v1i64(<1 x i64>, metadata, metadata)
declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i32(<1 x i32>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i32(<1 x i32>, metadata, metadata)
declare <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64>, metadata, metadata)
declare <1 x float> @llvm.experimental.constrained.uitofp.v1f32.v1i64(<1 x i64>, metadata, metadata)
; Illegal width declarations
declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.fadd.v3f64(<3 x double>, <3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.fsub.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.fsub.v3f64(<3 x double>, <3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.fmul.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.fmul.v3f64(<3 x double>, <3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.fdiv.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.fdiv.v3f64(<3 x double>, <3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.frem.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.frem.v3f64(<3 x double>, <3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.sqrt.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.sqrt.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.pow.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.pow.v3f64(<3 x double>, <3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.powi.v3f32(<3 x float>, i32, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.powi.v3f64(<3 x double>, i32, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.sin.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.exp2.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.log.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.log.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.log10.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.log10.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.log2.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.log2.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.rint.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.rint.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(<3 x float>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(<3 x double>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.maxnum.v3f32(<3 x float>, <3 x float>, metadata)
declare <3 x double> @llvm.experimental.constrained.maxnum.v3f64(<3 x double>, <3 x double>, metadata)
declare <3 x float> @llvm.experimental.constrained.minnum.v3f32(<3 x float>, <3 x float>, metadata)
declare <3 x double> @llvm.experimental.constrained.minnum.v3f64(<3 x double>, <3 x double>, metadata)
declare <3 x i32> @llvm.experimental.constrained.fptosi.v3i32.v3f32(<3 x float>, metadata)
declare <3 x i64> @llvm.experimental.constrained.fptosi.v3i64.v3f32(<3 x float>, metadata)
declare <3 x i32> @llvm.experimental.constrained.fptosi.v3i32.v3f64(<3 x double>, metadata)
declare <3 x i64> @llvm.experimental.constrained.fptosi.v3i64.v3f64(<3 x double>, metadata)
declare <3 x i32> @llvm.experimental.constrained.fptoui.v3i32.v3f32(<3 x float>, metadata)
declare <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f32(<3 x float>, metadata)
declare <3 x i32> @llvm.experimental.constrained.fptoui.v3i32.v3f64(<3 x double>, metadata)
declare <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f64(<3 x double>, metadata)
declare <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64(<3 x double>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(<3 x float>, metadata)
declare <3 x float> @llvm.experimental.constrained.ceil.v3f32(<3 x float>, metadata)
declare <3 x double> @llvm.experimental.constrained.ceil.v3f64(<3 x double>, metadata)
declare <3 x float> @llvm.experimental.constrained.floor.v3f32(<3 x float>, metadata)
declare <3 x double> @llvm.experimental.constrained.floor.v3f64(<3 x double>, metadata)
declare <3 x float> @llvm.experimental.constrained.round.v3f32(<3 x float>, metadata)
declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata)
declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata)
declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata)
declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i32(<3 x i32>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i32(<3 x i32>, metadata, metadata)
declare <3 x double> @llvm.experimental.constrained.uitofp.v3f64.v3i64(<3 x i64>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.uitofp.v3f32.v3i64(<3 x i64>, metadata, metadata)
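; A minimal, commented call sketch (not one of the autogenerated test cases
; above; the function and value names are hypothetical) showing how the
; 3-element declarations are exercised: each call carries explicit rounding
; and exception-behavior metadata operands and the #0 attribute group used
; throughout this file.
;
;   define <3 x double> @example_constrained_fadd_v3f64(<3 x double> %a, <3 x double> %b) #0 {
;   entry:
;     %add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(
;                                <3 x double> %a, <3 x double> %b,
;                                metadata !"round.dynamic",
;                                metadata !"fpexcept.strict") #0
;     ret <3 x double> %add
;   }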
; Double width declarations: 4-element vectors, twice the element count of
; the single width cases. (A commented call sketch follows this group of
; declarations.)
declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fsub.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fmul.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fdiv.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.frem.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.sqrt.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.log10.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.log2.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.rint.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.maxnum.v4f64(<4 x double>, <4 x double>, metadata)
declare <4 x double> @llvm.experimental.constrained.minnum.v4f64(<4 x double>, <4 x double>, metadata)
declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float>, metadata)
declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float>, metadata)
declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64(<4 x double>, metadata)
declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64(<4 x double>, metadata)
declare <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float>, metadata)
declare <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float>, metadata)
declare <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f64(<4 x double>, metadata)
declare <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64(<4 x double>, metadata)
declare <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(<4 x double>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32(<4 x float>, metadata)
declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, metadata)
declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata)
declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata)
declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata)
declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i32(<4 x i32>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i32(<4 x i32>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.sitofp.v4f64.v4i64(<4 x i64>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i32(<4 x i32>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i32(<4 x i32>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.uitofp.v4f64.v4i64(<4 x i64>, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64>, metadata, metadata)
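; Likewise, a commented call sketch for the 4-element declarations (again not
; part of the checked tests; the function and value names are hypothetical):
;
;   define <4 x double> @example_constrained_fdiv_v4f64(<4 x double> %a, <4 x double> %b) #0 {
;   entry:
;     %div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64(
;                                <4 x double> %a, <4 x double> %b,
;                                metadata !"round.dynamic",
;                                metadata !"fpexcept.strict") #0
;     ret <4 x double> %div
;   }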