[AVX512] Add vfmadd132ss and vfmadd132sd intrinsics

Differential Revision: http://reviews.llvm.org/D16589

llvm-svn: 259789
This commit is contained in:
Michael Zuckerman 2016-02-04 14:41:08 +00:00
parent 4d67ec3f8c
commit 7d73360479
5 changed files with 265 additions and 11 deletions

View File

@ -3941,6 +3941,43 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
// Scalar double-precision FMA intrinsic, "mask" form, bound to the GCC
// builtin __builtin_ia32_vfmaddsd3_mask.
// Signature: (v2f64, v2f64, v2f64, i8, i32) -> v2f64.
// NOTE(review): the i8 is presumably the write mask and the i32 the
// rounding-mode operand -- confirm against the lowering code.
def int_x86_avx512_mask_vfmadd_sd :
GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
// Scalar single-precision FMA intrinsic, "mask" form, bound to the GCC
// builtin __builtin_ia32_vfmaddss3_mask.
// Signature: (v4f32, v4f32, v4f32, i8, i32) -> v4f32.
// NOTE(review): the i8 is presumably the write mask and the i32 the
// rounding-mode operand -- confirm against the lowering code.
def int_x86_avx512_mask_vfmadd_ss :
GCCBuiltin<"__builtin_ia32_vfmaddss3_mask">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
// Scalar double-precision FMA intrinsic, "maskz" form, bound to the GCC
// builtin __builtin_ia32_vfmaddsd3_maskz.
// Signature: (v2f64, v2f64, v2f64, i8, i32) -> v2f64.
// NOTE(review): "maskz" presumably means masked-off results are zeroed;
// the i8 is presumably the mask and the i32 the rounding-mode operand --
// confirm against the lowering code.
def int_x86_avx512_maskz_vfmadd_sd :
GCCBuiltin<"__builtin_ia32_vfmaddsd3_maskz">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
// Scalar single-precision FMA intrinsic, "maskz" form, bound to the GCC
// builtin __builtin_ia32_vfmaddss3_maskz.
// Signature: (v4f32, v4f32, v4f32, i8, i32) -> v4f32.
// NOTE(review): "maskz" presumably means masked-off results are zeroed;
// the i8 is presumably the mask and the i32 the rounding-mode operand --
// confirm against the lowering code.
def int_x86_avx512_maskz_vfmadd_ss :
GCCBuiltin<"__builtin_ia32_vfmaddss3_maskz">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
// Scalar double-precision FMA intrinsic, "mask3" form, bound to the GCC
// builtin __builtin_ia32_vfmaddsd3_mask3.
// Signature: (v2f64, v2f64, v2f64, i8, i32) -> v2f64.
// NOTE(review): "mask3" presumably means the third source is the
// pass-through value; the i8 is presumably the mask and the i32 the
// rounding-mode operand -- confirm against the lowering code.
def int_x86_avx512_mask3_vfmadd_sd :
GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask3">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
// Scalar single-precision FMA intrinsic, "mask3" form, bound to the GCC
// builtin __builtin_ia32_vfmaddss3_mask3.
// Signature: (v4f32, v4f32, v4f32, i8, i32) -> v4f32.
// NOTE(review): "mask3" presumably means the third source is the
// pass-through value; the i8 is presumably the mask and the i32 the
// rounding-mode operand -- confirm against the lowering code.
def int_x86_avx512_mask3_vfmadd_ss :
GCCBuiltin<"__builtin_ia32_vfmaddss3_mask3">,
Intrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask3_vfmsub_pd_128 :
GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
Intrinsic<[llvm_v2f64_ty],

View File

@ -16938,6 +16938,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case FMA_OP_SCALAR_MASK:
case FMA_OP_SCALAR_MASK3:
case FMA_OP_SCALAR_MASKZ: {
  // Masked scalar FMA: operands 1-3 are the three sources, operand 4 is the
  // mask and operand 5 is the rounding operand forwarded to the FMA node.
  SDValue A = Op.getOperand(1);
  SDValue B = Op.getOperand(2);
  SDValue C = Op.getOperand(3);
  SDValue WriteMask = Op.getOperand(4);
  MVT ResVT = Op.getSimpleValueType();

  // Choose the pass-through value handed to getScalarMaskingNode:
  // a zero vector for MASKZ, the third source for MASK3, and the first
  // source for the plain MASK form.
  SDValue Fallback;
  switch (IntrData->Type) {
  case FMA_OP_SCALAR_MASKZ:
    Fallback = getZeroVector(ResVT, Subtarget, DAG, dl);
    break;
  case FMA_OP_SCALAR_MASK3:
    Fallback = C;
    break;
  default:
    Fallback = A;
    break;
  }

  SDValue RoundingMode = Op.getOperand(5);
  SDValue FMANode = DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), A, B,
                                C, RoundingMode);
  return getScalarMaskingNode(FMANode, WriteMask, Fallback, Subtarget, DAG);
}
case TERLOG_OP_MASK:
case TERLOG_OP_MASKZ: {
SDValue Src1 = Op.getOperand(1);

View File

@ -4713,9 +4713,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
string SUFF> {
defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
(_.VT (OpNode _.RC:$src2, _.RC:$src1,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))),
(_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@ -4724,10 +4724,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(_.ScalarLdFrag addr:$src3))))>;
defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)),
(_.VT (OpNode _.RC:$src2,
(_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnd _.RC:$src2,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
_.RC:$src1)),
_.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
@ -4736,10 +4736,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
(_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)),
(_.VT (OpNode _.RC:$src1,
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnd _.RC:$src1,
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
_.RC:$src2)),
_.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,

View File

@ -27,8 +27,9 @@ enum IntrinsicType {
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
@ -1748,6 +1749,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@ -1898,6 +1901,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@ -1970,6 +1975,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
X86ISD::FMADD_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,

View File

@ -7356,3 +7356,189 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
%res2 = add i8 %res, %res1
ret i8 %res2
}
; Calls llvm.x86.avx512.mask.vfmadd.sd four times: with an all-ones mask
; (i8 -1) and with the variable mask %x3, each once with i32 4 (expected
; assembly carries no rounding annotation) and once with i32 3 (expected
; assembly carries {rz-sae}). The four results are added together and
; returned. %x4 is not used by the body.
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
%res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
%res4 = fadd <2 x double> %res, %res1
%res5 = fadd <2 x double> %res2, %res3
%res6 = fadd <2 x double> %res4, %res5
ret <2 x double> %res6
}
; Single-precision counterpart of the mask.vfmadd.sd test: calls
; llvm.x86.avx512.mask.vfmadd.ss with an all-ones mask (i8 -1) and with the
; variable mask %x3, each once with i32 4 (no rounding annotation expected)
; and once with i32 3 ({rz-sae} expected). The four results are added
; together and returned. %x4 is not used by the body.
declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
%res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
%res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
%res4 = fadd <4 x float> %res, %res1
%res5 = fadd <4 x float> %res2, %res3
%res6 = fadd <4 x float> %res4, %res5
ret <4 x float> %res6
}
; Calls llvm.x86.avx512.maskz.vfmadd.sd twice with the variable mask %x3:
; once with i32 4 (no rounding annotation expected) and once with i32 3
; ({rz-sae} expected). Both expected instructions carry the {z} modifier.
; The two results are added and returned. %x4 is not used by the body.
declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
%res2 = fadd <2 x double> %res, %res1
ret <2 x double> %res2
}
; Calls llvm.x86.avx512.maskz.vfmadd.ss twice with the variable mask %x3:
; once with i32 4 (no rounding annotation expected) and once with i32 3
; ({rz-sae} expected). The two results are added and the sum is returned so
; that both calls are live and both appear in the expected assembly.
; %x4 is not used by the body.
; NOTE(review): the expected assembly below mirrors the double-precision
; maskz test; regenerate with utils/update_llc_test_checks.py to confirm.
declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
%res2 = fadd <4 x float> %res, %res1
ret <4 x float> %res2
}
; Calls llvm.x86.avx512.mask3.vfmadd.sd four times: with an all-ones mask
; (i8 -1) and with the variable mask %x3, each once with i32 4 (no rounding
; annotation expected) and once with i32 3 ({rz-sae} expected). The masked
; calls are expected to select the 231 form with a copy of %zmm2 as the
; destination. The four results are added together and returned.
; %x4 is not used by the body.
declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm2, %zmm5
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
%res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
%res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
%res4 = fadd <2 x double> %res, %res1
%res5 = fadd <2 x double> %res2, %res3
%res6 = fadd <2 x double> %res4, %res5
ret <2 x double> %res6
}
; Single-precision counterpart of the mask3.vfmadd.sd test: calls
; llvm.x86.avx512.mask3.vfmadd.ss with an all-ones mask (i8 -1) and with the
; variable mask %x3, each once with i32 4 (no rounding annotation expected)
; and once with i32 3 ({rz-sae} expected). The masked calls are expected to
; select the 231 form with a copy of %zmm2 as the destination. The four
; results are added together and returned. %x4 is not used by the body.
declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm2, %zmm5
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
%res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
%res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
%res4 = fadd <4 x float> %res, %res1
%res5 = fadd <4 x float> %res2, %res3
%res6 = fadd <4 x float> %res4, %res5
ret <4 x float> %res6
}
; Register/memory form of mask3.vfmadd.ss: a float is loaded from %ptr_b,
; inserted into lane 0 of an undef vector and passed as the second operand,
; with %x1 as the third operand and %x3 as the mask. The expected assembly
; folds the load into the instruction as the (%rdi) memory operand of the
; 231 form. %x4 is not used by the body.
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
ret < 4 x float> %res
}
; Register/memory form of mask.vfmadd.ss: a float is loaded from %ptr_b,
; inserted into lane 0 of an undef vector and passed as the second operand,
; with %x1 as the third operand and %x3 as the mask. The expected assembly
; folds the load into the instruction as the (%rdi) memory operand of the
; 132 form, writing %xmm0 in place. %x4 is not used by the body.
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
ret < 4 x float> %res
}
; Register/memory form of maskz.vfmadd.ss: a float is loaded from %ptr_b,
; inserted into lane 0 of an undef vector and passed as the third operand.
; The expected assembly folds the load into the instruction as the (%rdi)
; memory operand of the 213 form with the {z} modifier.
; NOTE(review): the mask argument is the constant i8 0, so the expected
; assembly materializes %k1 with kxorw and neither %x3 nor %x4 is used;
; the sibling _rm tests pass %x3 as the mask -- confirm the constant is
; intentional.
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
ret < 4 x float> %res
}