From a3a714082b4d122b8556659100b657d3ccc498b9 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Wed, 9 Oct 2013 08:16:14 +0000 Subject: [PATCH] AVX-512: Added VRCP28 and VRSQRT28 instructions and intrinsics. llvm-svn: 192283 --- llvm/include/llvm/IR/IntrinsicsX86.td | 55 +++++++--- llvm/lib/Target/X86/X86InstrAVX512.td | 115 +++++++++++++++------ llvm/lib/Target/X86/X86InstrInfo.td | 4 +- llvm/lib/Target/X86/X86InstrSSE.td | 5 +- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 74 ++++++++++--- 5 files changed, 190 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index bd22e1b37820..54ea37e24d85 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2745,29 +2745,54 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_avx512_rcp14ps512">, + def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_rcp14ps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_avx512_rcp14pd512">, + def int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_rcp14pd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_avx512_rcp14ss">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], + def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_avx512_rcp14sd">, - Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], + def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_avx512_rsqrt14ps512">, + def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_avx512_rsqrt14pd512">, + def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt14pd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_avx512_rsqrt14ss">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], + def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_avx512_rsqrt14sd">, - Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], + def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], + [IntrNoMem]>; + + def int_x86_avx512_rcp28_ps_512 : GCCBuiltin<"__builtin_ia32_rcp28ps512">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_avx512_rcp28_pd_512 : GCCBuiltin<"__builtin_ia32_rcp28pd512">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], + [IntrNoMem]>; + def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], + [IntrNoMem]>; + def int_x86_avx512_rsqrt28_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt28ps512">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], + [IntrNoMem]>; + def int_x86_avx512_rsqrt28_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt28pd512">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], + [IntrNoMem]>; + def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], + [IntrNoMem]>; + def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; } @@ -2910,14 +2935,14 @@ let TargetPrefix = "x86" in { } let TargetPrefix = "x86" in { - def int_x86_avx512_mskblend_ps_512 : GCCBuiltin<"__builtin_ia32_avx512_mskblendps512">, + def int_x86_avx512_mskblend_ps_512 : GCCBuiltin<"__builtin_ia32_mskblendps512">, Intrinsic<[llvm_v16f32_ty], [llvm_i16_ty, llvm_v16f32_ty, llvm_v16f32_ty], [IntrNoMem]>; - def int_x86_avx512_cmpeq_pi_512 : GCCBuiltin<"__builtin_ia32_avx512_cmpeqpi512">, + def int_x86_avx512_cmpeq_pi_512 : GCCBuiltin<"__builtin_ia32_cmpeqpi512">, Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; - def int_x86_avx512_and_pi : GCCBuiltin<"__builtin_ia32_avx512_andpi512">, + def int_x86_avx512_and_pi : GCCBuiltin<"__builtin_ia32_andpi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 47480b3a148e..fd28b1c7a380 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2643,8 +2643,7 @@ multiclass avx512_fp_unop_p_int opc, string OpcodeStr, } /// avx512_fp_unop_s - AVX-512 unops in scalar form. -multiclass avx512_fp_unop_s opc, string OpcodeStr, - Intrinsic F32Int, Intrinsic F64Int> { +multiclass avx512_fp_unop_s opc, string OpcodeStr> { let hasSideEffects = 0 in { def SSZr : AVX5128I opc, string OpcodeStr, (ins VR128X:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2))]>, - EVEX_4V, EVEX_CD8<32, CD8VT1>; + []>, EVEX_4V, EVEX_CD8<32, CD8VT1>; } def SDZr : AVX5128I opc, string OpcodeStr, (ins FR64X:$src1, f64mem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - EVEX_4V, VEX_W, EVEX_CD8<32, CD8VT1>; + EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; def SDZm_Int : AVX5128I, - EVEX_4V, VEX_W, EVEX_CD8<32, CD8VT1>; + []>, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; } } } -defm VRCP14 : avx512_fp_unop_s<0x4D, "vrcp14", int_x86_avx512_rcp14_ss, - int_x86_avx512_rcp14_sd>, +defm VRCP14 : avx512_fp_unop_s<0x4D, "vrcp14">, avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>, avx512_fp_unop_p_int<0x4C, "vrcp14", int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>; -defm VRSQRT14 : avx512_fp_unop_s<0x4F, "vrsqrt14", int_x86_avx512_rsqrt14_ss, - int_x86_avx512_rsqrt14_sd>, +defm VRSQRT14 : avx512_fp_unop_s<0x4F, "vrsqrt14">, avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>, avx512_fp_unop_p_int<0x4E, "vrsqrt14", int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>; +def : Pat<(int_x86_avx512_rsqrt14_ss VR128X:$src), + (COPY_TO_REGCLASS (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; +def : Pat<(int_x86_avx512_rsqrt14_ss sse_load_f32:$src), + (VRSQRT14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + +def : Pat<(int_x86_avx512_rcp14_ss VR128X:$src), + (COPY_TO_REGCLASS (VRCP14SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; +def : Pat<(int_x86_avx512_rcp14_ss sse_load_f32:$src), + (VRCP14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + +let AddedComplexity = 20, Predicates = [HasERI] in { +defm VRCP28 : avx512_fp_unop_s<0xCB, "vrcp28">, + avx512_fp_unop_p<0xCA, "vrcp28", X86frcp>, + avx512_fp_unop_p_int<0xCA, "vrcp28", + int_x86_avx512_rcp28_ps_512, int_x86_avx512_rcp28_pd_512>; + +defm VRSQRT28 : avx512_fp_unop_s<0xCD, "vrsqrt28">, + avx512_fp_unop_p<0xCC, "vrsqrt28", X86frsqrt>, + avx512_fp_unop_p_int<0xCC, "vrsqrt28", + int_x86_avx512_rsqrt28_ps_512, int_x86_avx512_rsqrt28_pd_512>; +} + +let Predicates = [HasERI] in { + def : Pat<(int_x86_avx512_rsqrt28_ss VR128X:$src), + (COPY_TO_REGCLASS (VRSQRT28SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; + def : Pat<(int_x86_avx512_rsqrt28_ss sse_load_f32:$src), + (VRSQRT28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_avx512_rcp28_ss VR128X:$src), + (COPY_TO_REGCLASS (VRCP28SSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; + def : Pat<(int_x86_avx512_rcp28_ss sse_load_f32:$src), + (VRCP28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; +} multiclass avx512_sqrt_packed opc, string OpcodeStr, SDNode OpNode, Intrinsic V16F32Int, Intrinsic V8F64Int, OpndItins itins_s, OpndItins itins_d> { @@ -2810,28 +2846,45 @@ defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512, SSE_SQRTPS, SSE_SQRTPD>; -def : Pat<(f32 (fsqrt FR32X:$src)), - (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; -def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; -def : Pat<(f64 (fsqrt FR64X:$src)), - (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; -def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; +let Predicates = [HasAVX512] in { + def : Pat<(f32 (fsqrt FR32X:$src)), + (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + def : Pat<(f64 (fsqrt FR64X:$src)), + (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; + def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; -def : Pat<(f32 (X86frsqrt FR32X:$src)), - (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; -def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; + def : Pat<(f32 (X86frsqrt FR32X:$src)), + (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + + def : Pat<(f32 (X86frcp FR32X:$src)), + (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[OptForSize]>; + + def : Pat<(int_x86_sse_sqrt_ss VR128X:$src), + (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR32)), + VR128X)>; + def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), + (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src), + (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS VR128X:$src, FR64)), + VR128X)>; + def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), + (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; +} -def : Pat<(f32 (X86frcp FR32X:$src)), - (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; -def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; multiclass avx512_fp_unop_rm opcps, bits<8> opcpd, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 82edecba8295..ede418df27e9 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -649,13 +649,13 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; -def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">; def HasPFI : Predicate<"Subtarget->hasPFI()">; -def HasEMI : Predicate<"Subtarget->hasERI()">; +def HasERI : Predicate<"Subtarget->hasERI()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 7e0fcda6b87f..d4d92a2dc864 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -3357,7 +3357,8 @@ let Predicates = [UseAVX] in { def : Pat<(f32 (X86frcp (load addr:$src))), (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX, OptForSize]>; - +} +let Predicates = [UseAVX] in { def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), @@ -3371,7 +3372,9 @@ let Predicates = [UseAVX] in { VR128)>; def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; +} +let Predicates = [HasAVX] in { def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS VR128:$src, FR32)), diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index dc2ab859cede..ebf8c2440a84 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1,39 +1,52 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone -; CHECK: test_x86_avx3_kortestz +; CHECK: test_kortestz ; CHECK: kortestw ; CHECK: sete -define i32 @test_x86_avx3_kortestz(i16 %a0, i16 %a1) { +define i32 @test_kortestz(i16 %a0, i16 %a1) { %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1) ret i32 %res } declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone -; CHECK: test_x86_avx3_kortestc +; CHECK: test_kortestc ; CHECK: kortestw ; CHECK: sbbl -define i32 @test_x86_avx3_kortestc(i16 %a0, i16 %a1) { +define i32 @test_kortestc(i16 %a0, i16 %a1) { %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1) ret i32 %res } -define <16 x float> @test_x86_avx3_rcp_ps_512(<16 x float> %a0) { +define <16 x float> @test_rcp_ps_512(<16 x float> %a0) { ; CHECK: vrcp14ps %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] ret <16 x float> %res } declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>) nounwind readnone -define <8 x double> @test_x86_avx3_rcp_pd_512(<8 x double> %a0) { +define <8 x double> @test_rcp_pd_512(<8 x double> %a0) { ; CHECK: vrcp14pd %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>) nounwind readnone +define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { + ; CHECK: vrcp28ps + %res = call <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float>) nounwind readnone -define <8 x double> @test_x86_avx3_rndscale_pd_512(<8 x double> %a0) { +define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { + ; CHECK: vrcp28pd + %res = call <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double>) nounwind readnone + +define <8 x double> @test_rndscale_pd_512(<8 x double> %a0) { ; CHECK: vrndscale %res = call <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double> %a0, i32 7) ; <<8 x double>> [#uses=1] ret <8 x double> %res @@ -41,7 +54,7 @@ define <8 x double> @test_x86_avx3_rndscale_pd_512(<8 x double> %a0) { declare <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double>, i32) nounwind readnone -define <16 x float> @test_x86_avx3_rndscale_ps_512(<16 x float> %a0) { +define <16 x float> @test_rndscale_ps_512(<16 x float> %a0) { ; CHECK: vrndscale %res = call <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float> %a0, i32 7) ; <<16 x float>> [#uses=1] ret <16 x float> %res @@ -49,37 +62,70 @@ define <16 x float> @test_x86_avx3_rndscale_ps_512(<16 x float> %a0) { declare <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float>, i32) nounwind readnone -define <16 x float> @test_x86_avx3_rsqrt_ps_512(<16 x float> %a0) { +define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) { ; CHECK: vrsqrt14ps %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] ret <16 x float> %res } declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>) nounwind readnone +define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) { + ; CHECK: vrsqrt28ps + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float>) nounwind readnone -define <8 x double> @test_x86_avx3_sqrt_pd_512(<8 x double> %a0) { +define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) { + ; CHECK: vrsqrt14ss + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>) nounwind readnone + +define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { + ; CHECK: vrsqrt28ss + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>) nounwind readnone + +define <4 x float> @test_rcp14_ss(<4 x float> %a0) { + ; CHECK: vrcp14ss + %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>) nounwind readnone + +define <4 x float> @test_rcp28_ss(<4 x float> %a0) { + ; CHECK: vrcp28ss + %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>) nounwind readnone + +define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) { ; CHECK: vsqrtpd %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone - -define <16 x float> @test_x86_avx3_sqrt_ps_512(<16 x float> %a0) { +define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) { ; CHECK: vsqrtps %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] ret <16 x float> %res } declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone -define <4 x float> @test_x86_avx3_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { +define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vsqrtssz %res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone -define <2 x double> @test_x86_avx3_sqrt_sd(<2 x double> %a0, <2 x double> %a1) { +define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: vsqrtsdz %res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1] ret <2 x double> %res