[X86] Remove all of the avx512 masked packed fma intrinsics. Use llvm.fma or unmasked 512-bit intrinsics with rounding mode.

This upgrades all of the intrinsics to use fneg instructions to convert fma into fmsub/fnmsub/fnmadd/fmsubadd. And uses a select instruction for masking.

This matches how clang uses the intrinsics these days.

llvm-svn: 336409
This commit is contained in:
Craig Topper 2018-07-06 03:42:09 +00:00
parent 4ea8949697
commit 7b35585ff1
7 changed files with 2097 additions and 784 deletions

View File

@ -1912,206 +1912,27 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmadd_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmadd_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmadd_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmadd_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_vfmadd_pd_512 :
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmadd_pd_512 :
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_maskz_vfmadd_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmadd_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmadd_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmadd_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmadd_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_vfmadd_ps_512 :
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmadd_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_maskz_vfmadd_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfmaddsub_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmaddsub_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmaddsub_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmaddsub_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmaddsub_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmaddsub_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
// TODO: Can we use 2 vfmadds+shufflevector?
def int_x86_avx512_vfmaddsub_pd_512 :
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmaddsub_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmaddsub_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_maskz_vfmaddsub_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfmaddsub_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmaddsub_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmaddsub_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmaddsub_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmaddsub_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_maskz_vfmaddsub_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_vfmaddsub_ps_512 :
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfmaddsub_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmaddsub_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_maskz_vfmaddsub_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfmadd_sd :
GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
@ -2161,96 +1982,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmsubadd_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsubadd_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsubadd_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmsubadd_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsubadd_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfmsubadd_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfnmadd_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmadd_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmadd_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfnmadd_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmadd_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmadd_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_sd :
GCCBuiltin<"__builtin_ia32_vfnmsubsd3_mask3">,
Intrinsic<[llvm_v2f64_ty],
@ -2263,66 +1994,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfnmsub_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_pd_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmsub_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_pd_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f64_ty],
[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmsub_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_pd_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f64_ty],
[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_vfnmsub_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_ps_128 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmsub_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_ps_256 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v8f32_ty],
[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_mask_vfnmsub_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfnmsub_ps_512 : // FIXME: remove this intrinsic.
Intrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_vpmadd52h_uq_128 :
GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,

View File

@ -80,6 +80,17 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("fma.vfmsubadd.") || // Added in 7.0
Name.startswith("fma.vfnmadd.") || // Added in 7.0
Name.startswith("fma.vfnmsub.") || // Added in 7.0
Name.startswith("avx512.mask.vfmadd.p") || // Added in 7.0
Name.startswith("avx512.mask.vfnmadd.p") || // Added in 7.0
Name.startswith("avx512.mask.vfnmsub.p") || // Added in 7.0
Name.startswith("avx512.mask3.vfmadd.p") || // Added in 7.0
Name.startswith("avx512.maskz.vfmadd.p") || // Added in 7.0
Name.startswith("avx512.mask3.vfmsub.p") || // Added in 7.0
Name.startswith("avx512.mask3.vfnmsub.p") || // Added in 7.0
Name.startswith("avx512.mask.vfmaddsub.p") || // Added in 7.0
Name.startswith("avx512.maskz.vfmaddsub.p") || // Added in 7.0
Name.startswith("avx512.mask3.vfmaddsub.p") || // Added in 7.0
Name.startswith("avx512.mask3.vfmsubadd.p") || // Added in 7.0
Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
Name.startswith("avx512.kunpck") || //added in 6.0
@ -2779,13 +2790,74 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
if (IsScalar)
Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep,
(uint64_t)0);
} else if (IsX86 && (Name.startswith("avx512.mask.vfmadd.p") ||
Name.startswith("avx512.mask.vfnmadd.p") ||
Name.startswith("avx512.mask.vfnmsub.p") ||
Name.startswith("avx512.mask3.vfmadd.p") ||
Name.startswith("avx512.mask3.vfmsub.p") ||
Name.startswith("avx512.mask3.vfnmsub.p") ||
Name.startswith("avx512.maskz.vfmadd.p"))) {
bool IsMask3 = Name[11] == '3';
bool IsMaskZ = Name[11] == 'z';
// Drop the "avx512.mask." to make it easier.
Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
bool NegMul = Name[2] == 'n';
bool NegAcc = NegMul ? Name[4] == 's' : Name[3] == 's';
if (CI->getNumArgOperands() == 5 &&
(!isa<ConstantInt>(CI->getArgOperand(4)) ||
cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) {
Intrinsic::ID IID;
// Check the character before ".512" in string.
if (Name[Name.size()-5] == 's')
IID = Intrinsic::x86_avx512_vfmadd_ps_512;
else
IID = Intrinsic::x86_avx512_vfmadd_pd_512;
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(4) };
if (NegMul) {
if (IsMaskZ || IsMask3)
Ops[0] = Builder.CreateFNeg(Ops[0]);
else
Ops[1] = Builder.CreateFNeg(Ops[1]);
}
if (NegAcc)
Ops[2] = Builder.CreateFNeg(Ops[2]);
Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
Ops);
} else {
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2) };
if (NegMul) {
if (IsMaskZ || IsMask3)
Ops[0] = Builder.CreateFNeg(Ops[0]);
else
Ops[1] = Builder.CreateFNeg(Ops[1]);
}
if (NegAcc)
Ops[2] = Builder.CreateFNeg(Ops[2]);
Function *FMA = Intrinsic::getDeclaration(CI->getModule(),
Intrinsic::fma,
Ops[0]->getType());
Rep = Builder.CreateCall(FMA, Ops);
}
Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) :
IsMask3 ? CI->getArgOperand(2) :
CI->getArgOperand(0);
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
} else if (IsX86 && (Name.startswith("fma.vfmaddsub.p") ||
Name.startswith("fma.vfmsubadd.p"))) {
bool IsSubAdd = Name[7] == 's';
int NumElts = CI->getType()->getVectorNumElements();
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2) };
@ -2803,6 +2875,60 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Idxs[i] = i + (i % 2) * NumElts;
Rep = Builder.CreateShuffleVector(Even, Odd, Idxs);
} else if (IsX86 && (Name.startswith("avx512.mask.vfmaddsub.p") ||
Name.startswith("avx512.mask3.vfmaddsub.p") ||
Name.startswith("avx512.maskz.vfmaddsub.p") ||
Name.startswith("avx512.mask3.vfmsubadd.p"))) {
bool IsMask3 = Name[11] == '3';
bool IsMaskZ = Name[11] == 'z';
// Drop the "avx512.mask." to make it easier.
Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
bool IsSubAdd = Name[3] == 's';
if (CI->getNumArgOperands() == 5 &&
(!isa<ConstantInt>(CI->getArgOperand(4)) ||
cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) {
Intrinsic::ID IID;
// Check the character before ".512" in string.
if (Name[Name.size()-5] == 's')
IID = Intrinsic::x86_avx512_vfmaddsub_ps_512;
else
IID = Intrinsic::x86_avx512_vfmaddsub_pd_512;
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(4) };
if (IsSubAdd)
Ops[2] = Builder.CreateFNeg(Ops[2]);
Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
{CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(4)});
} else {
int NumElts = CI->getType()->getVectorNumElements();
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2) };
Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma,
Ops[0]->getType());
Value *Odd = Builder.CreateCall(FMA, Ops);
Ops[2] = Builder.CreateFNeg(Ops[2]);
Value *Even = Builder.CreateCall(FMA, Ops);
if (IsSubAdd)
std::swap(Even, Odd);
SmallVector<uint32_t, 32> Idxs(NumElts);
for (int i = 0; i != NumElts; ++i)
Idxs[i] = i + (i % 2) * NumElts;
Rep = Builder.CreateShuffleVector(Even, Odd, Idxs);
}
Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) :
IsMask3 ? CI->getArgOperand(2) :
CI->getArgOperand(0);
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
} else if (IsX86 && (Name.startswith("avx512.mask.pternlog.") ||
Name.startswith("avx512.maskz.pternlog."))) {
bool ZeroMask = Name[11] == 'z';

View File

@ -20678,7 +20678,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Swap Src1 and Src2 in the node creation
return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
}
case FMA_OP_MASK3:
case FMA_OP_MASKZ:
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
@ -20691,8 +20690,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// set PassThru element
if (IntrData->Type == FMA_OP_MASKZ)
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
else if (IntrData->Type == FMA_OP_MASK3)
PassThru = Src3;
else
PassThru = Src1;

View File

@ -28,7 +28,7 @@ enum IntrinsicType : uint16_t {
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
INTR_TYPE_3OP_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
FMA_OP_MASK, FMA_OP_MASKZ,
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
@ -878,43 +878,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
X86ISD::FNMADD_RND),
X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
X86ISD::FNMADD_RND),
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
@ -942,54 +908,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK,
X86ISD::VPSHUFBITQMB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
X86ISD::FMSUBADD_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
X86ISD::FMSUBADD_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
@ -1008,25 +932,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
X86ISD::VFIXUPIMMS, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),

View File

@ -11,10 +11,10 @@ define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <
; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xac,0xc2]
; CHECK-NEXT: # zmm0 = -(zmm1 * zmm0) + zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
%2 = call <16 x float> @llvm.fma.v16f32(<16 x float> %a0, <16 x float> %1, <16 x float> %a2)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; X86-LABEL: test_mask_vfnmadd_ps:
@ -30,8 +30,11 @@ define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <1
; X64-NEXT: vfnmadd132ps %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x9c,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
%2 = call <16 x float> @llvm.fma.v16f32(<16 x float> %a0, <16 x float> %1, <16 x float> %a2)
%3 = bitcast i16 %mask to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %a0
ret <16 x float> %4
}
define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -40,10 +43,10 @@ define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <
; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xac,0xc2]
; CHECK-NEXT: # zmm0 = -(zmm1 * zmm0) + zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = call <8 x double> @llvm.fma.v8f64(<8 x double> %a0, <8 x double> %1, <8 x double> %a2)
ret <8 x double> %2
}
declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; X86-LABEL: test_mask_vfnmadd_pd:
@ -60,8 +63,11 @@ define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8
; X64-NEXT: vfnmadd132pd %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0x9c,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = call <8 x double> @llvm.fma.v8f64(<8 x double> %a0, <8 x double> %1, <8 x double> %a2)
%3 = bitcast i8 %mask to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %a0
ret <8 x double> %4
}
define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
@ -70,10 +76,11 @@ define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <1
; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xae,0xc2]
; CHECK-NEXT: # zmm0 = -(zmm1 * zmm0) - zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %a0, <16 x float> %1, <16 x float> %2)
ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; X86-LABEL: test_mask_vfnmsub_ps:
@ -89,8 +96,12 @@ define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <1
; X64-NEXT: vfnmsub132ps %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x9e,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %a0, <16 x float> %1, <16 x float> %2)
%4 = bitcast i16 %mask to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %a0
ret <16 x float> %5
}
define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -99,10 +110,11 @@ define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
; CHECK-NEXT: # zmm0 = -(zmm1 * zmm0) - zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %a0, <8 x double> %1, <8 x double> %2)
ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; X86-LABEL: test_mask_vfnmsub_pd:
@ -119,8 +131,12 @@ define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8
; X64-NEXT: vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0x9e,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %a0, <8 x double> %1, <8 x double> %2)
%4 = bitcast i8 %mask to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %a0
ret <8 x double> %5
}
define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
@ -129,7 +145,7 @@ define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1,
; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xa6,0xc2]
; CHECK-NEXT: # zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i32 4) nounwind
%res = call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i32 4) #2
ret <16 x float> %res
}
@ -161,7 +177,7 @@ define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1,
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xa6,0xc2]
; CHECK-NEXT: # zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i32 4) nounwind
%res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i32 4) #2
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) nounwind readnone
@ -181,7 +197,7 @@ define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1,
; X64-NEXT: vfmaddsub132pd %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0x96,0xc1]
; X64-NEXT: # zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i32 4) nounwind
%res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i32 4) #2
%bc = bitcast i8 %mask to <8 x i1>
%sel = select <8 x i1> %bc, <8 x double> %res, <8 x double> %a0
ret <8 x double> %sel
@ -208,8 +224,6 @@ define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0,
ret <8 x double> %sel
}
declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
; X86: # %bb.0:
@ -227,12 +241,15 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0,
; X64-NEXT: # zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2)
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %2)
%4 = shufflevector <8 x double> %3, <8 x double> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %x2
ret <8 x double> %6
}
declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
; X86: # %bb.0:
@ -248,8 +265,13 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0,
; X64-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xa6,0xc2]
; X64-NEXT: # zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2)
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %2)
%4 = shufflevector <8 x double> %3, <8 x double> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x double> %4, <8 x double> zeroinitializer
ret <8 x double> %6
}
define <16 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
@ -272,8 +294,6 @@ define <16 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_512(<16 x float> %x0,
ret <16 x float> %sel
}
declare <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_512:
; X86: # %bb.0:
@ -290,12 +310,15 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_512(<16 x float> %x0,
; X64-NEXT: # zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2)
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %2)
%4 = shufflevector <16 x float> %3, <16 x float> %1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x float> %4, <16 x float> %x2
ret <16 x float> %6
}
declare <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; X86-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_512:
; X86: # %bb.0:
@ -310,12 +333,15 @@ define <16 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_512(<16 x float> %x0,
; X64-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xa6,0xc2]
; X64-NEXT: # zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2)
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %2)
%4 = shufflevector <16 x float> %3, <16 x float> %1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x float> %4, <16 x float> zeroinitializer
ret <16 x float> %6
}
declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
; X86: # %bb.0:
@ -333,12 +359,15 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0,
; X64-NEXT: # zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2)
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %2)
%4 = shufflevector <8 x double> %1, <8 x double> %3, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %x2
ret <8 x double> %6
}
declare <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_512:
; X86: # %bb.0:
@ -355,8 +384,13 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0,
; X64-NEXT: # zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2)
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %2)
%4 = shufflevector <16 x float> %1, <16 x float> %3, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x float> %4, <16 x float> %x2
ret <16 x float> %6
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
@ -497,8 +531,6 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0,
ret <16 x float> %res
}
declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
; X86: # %bb.0:
@ -516,12 +548,13 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8
; X64-NEXT: # zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
%2 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %1)
%3 = bitcast i8 %x3 to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %x2
ret <8 x double> %4
}
declare <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_512:
; X86: # %bb.0:
@ -538,8 +571,11 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <1
; X64-NEXT: # zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%2 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %1)
%3 = bitcast i16 %x3 to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %x2
ret <16 x float> %4
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
@ -706,8 +742,6 @@ define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8
ret <8 x double> %sel
}
declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
; X86: # %bb.0:
@ -725,12 +759,12 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8
; X64-NEXT: # zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
ret <8 x double> %3
}
declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
; X86: # %bb.0:
@ -746,8 +780,10 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8
; X64-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xa8,0xc2]
; X64-NEXT: # zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
ret <8 x double> %3
}
define <16 x float>@test_int_x86_avx512_mask_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
@ -770,8 +806,6 @@ define <16 x float>@test_int_x86_avx512_mask_vfmadd_ps_512(<16 x float> %x0, <16
ret <16 x float> %sel
}
declare <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask3_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_512:
; X86: # %bb.0:
@ -788,13 +822,13 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmadd_ps_512(<16 x float> %x0, <1
; X64-NEXT: # zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
define <16 x float> @test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_512:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@ -808,11 +842,12 @@ define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <1
; X64-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0xa8,0xc2]
; X64-NEXT: # zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
ret <16 x float> %3
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; X86-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
; X86: # %bb.0:
@ -826,8 +861,12 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x19,0x9e,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 0)
%4 = bitcast i8 %mask to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %a0
ret <8 x double> %5
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
@ -843,8 +882,12 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x39,0x9e,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 1)
%4 = bitcast i8 %mask to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %a0
ret <8 x double> %5
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
@ -860,8 +903,12 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x59,0x9e,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 2)
%4 = bitcast i8 %mask to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %a0
ret <8 x double> %5
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
@ -877,8 +924,12 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x79,0x9e,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 3)
%4 = bitcast i8 %mask to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %a0
ret <8 x double> %5
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
@ -896,8 +947,12 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0,
; X64-NEXT: vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0x9e,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %a0, <8 x double> %1, <8 x double> %2)
%4 = bitcast i8 %mask to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %a0
ret <8 x double> %5
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -905,8 +960,10 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 0)
ret <8 x double> %3
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -914,8 +971,10 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 1)
ret <8 x double> %3
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -923,8 +982,10 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 2)
ret <8 x double> %3
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -932,8 +993,10 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8
; CHECK: # %bb.0:
; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a0, <8 x double> %1, <8 x double> %2, i32 3)
ret <8 x double> %3
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
@ -942,8 +1005,10 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
; CHECK-NEXT: # zmm0 = -(zmm1 * zmm0) - zmm2
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %a0, <8 x double> %1, <8 x double> %2)
ret <8 x double> %3
}
define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
@ -961,12 +1026,14 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8
; X64-NEXT: vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0x9e,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x1
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %1, <8 x double> %2)
%4 = bitcast i8 %x3 to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %x0
ret <8 x double> %5
}
declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
; X86: # %bb.0:
@ -984,8 +1051,12 @@ define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <
; X64-NEXT: # zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x0
%2 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
%3 = call <8 x double> @llvm.fma.v8f64(<8 x double> %1, <8 x double> %x1, <8 x double> %2)
%4 = bitcast i8 %x3 to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %x2
ret <8 x double> %5
}
define <16 x float>@test_int_x86_avx512_mask_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
@ -1002,12 +1073,14 @@ define <16 x float>@test_int_x86_avx512_mask_vfnmsub_ps_512(<16 x float> %x0, <1
; X64-NEXT: vfnmsub132ps %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x9e,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x1
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %1, <16 x float> %2)
%4 = bitcast i16 %x3 to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %x0
ret <16 x float> %5
}
declare <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; X86-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_512:
; X86: # %bb.0:
@ -1024,8 +1097,12 @@ define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <
; X64-NEXT: # zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
%2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
%3 = call <16 x float> @llvm.fma.v16f32(<16 x float> %1, <16 x float> %x1, <16 x float> %2)
%4 = bitcast i16 %x3 to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %x2
ret <16 x float> %5
}
define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
@ -1043,8 +1120,11 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8
; X64-NEXT: vfnmadd132pd %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0x9c,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 4)
ret <8 x double> %res
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x1
%2 = call <8 x double> @llvm.fma.v8f64(<8 x double> %x0, <8 x double> %1, <8 x double> %x2)
%3 = bitcast i8 %x3 to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %x0
ret <8 x double> %4
}
define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
@ -1061,6 +1141,12 @@ define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <1
; X64-NEXT: vfnmadd132ps %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x49,0x9c,0xc1]
; X64-NEXT: # zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4)
ret <16 x float> %res
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x1
%2 = call <16 x float> @llvm.fma.v16f32(<16 x float> %x0, <16 x float> %1, <16 x float> %x2)
%3 = bitcast i16 %x3 to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %x0
ret <16 x float> %4
}
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff