forked from OSchip/llvm-project
[AVX512] add vfmadd132ss and vfmadd132sd Intrinsic
Differential Revision: http://reviews.llvm.org/D16589 llvm-svn: 259789
This commit is contained in:
parent
4d67ec3f8c
commit
7d73360479
|
@ -3941,6 +3941,43 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
|
||||
llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
|
||||
// Scalar double FMA intrinsic, "mask" form.
// Takes three <2 x double> sources followed by an i8 and an i32 operand and
// returns <2 x double>. The FMA_OP_SCALAR_MASK lowering reads the i8 as the
// write-mask, the i32 as the rounding operand, and uses the first source as
// the pass-through value.
def int_x86_avx512_mask_vfmadd_sd :
    GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
    Intrinsic<[llvm_v2f64_ty],
              [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
               llvm_i32_ty],
              [IntrNoMem]>;
|
||||
|
||||
// Scalar float FMA intrinsic, "mask" form.
// Takes three <4 x float> sources followed by an i8 and an i32 operand and
// returns <4 x float>. The FMA_OP_SCALAR_MASK lowering reads the i8 as the
// write-mask, the i32 as the rounding operand, and uses the first source as
// the pass-through value.
def int_x86_avx512_mask_vfmadd_ss :
    GCCBuiltin<"__builtin_ia32_vfmaddss3_mask">,
    Intrinsic<[llvm_v4f32_ty],
              [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
               llvm_i32_ty],
              [IntrNoMem]>;
|
||||
|
||||
// Scalar double FMA intrinsic, "maskz" form.
// Takes three <2 x double> sources followed by an i8 and an i32 operand and
// returns <2 x double>. The FMA_OP_SCALAR_MASKZ lowering reads the i8 as the
// write-mask, the i32 as the rounding operand, and uses a zero vector as the
// pass-through value.
def int_x86_avx512_maskz_vfmadd_sd :
    GCCBuiltin<"__builtin_ia32_vfmaddsd3_maskz">,
    Intrinsic<[llvm_v2f64_ty],
              [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
               llvm_i32_ty],
              [IntrNoMem]>;
|
||||
|
||||
// Scalar float FMA intrinsic, "maskz" form.
// Takes three <4 x float> sources followed by an i8 and an i32 operand and
// returns <4 x float>. The FMA_OP_SCALAR_MASKZ lowering reads the i8 as the
// write-mask, the i32 as the rounding operand, and uses a zero vector as the
// pass-through value.
def int_x86_avx512_maskz_vfmadd_ss :
    GCCBuiltin<"__builtin_ia32_vfmaddss3_maskz">,
    Intrinsic<[llvm_v4f32_ty],
              [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
               llvm_i32_ty],
              [IntrNoMem]>;
|
||||
|
||||
// Scalar double FMA intrinsic, "mask3" form.
// Takes three <2 x double> sources followed by an i8 and an i32 operand and
// returns <2 x double>. The FMA_OP_SCALAR_MASK3 lowering reads the i8 as the
// write-mask, the i32 as the rounding operand, and uses the third source as
// the pass-through value.
def int_x86_avx512_mask3_vfmadd_sd :
    GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask3">,
    Intrinsic<[llvm_v2f64_ty],
              [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
               llvm_i32_ty],
              [IntrNoMem]>;
|
||||
|
||||
// Scalar float FMA intrinsic, "mask3" form.
// Takes three <4 x float> sources followed by an i8 and an i32 operand and
// returns <4 x float>. The FMA_OP_SCALAR_MASK3 lowering reads the i8 as the
// write-mask, the i32 as the rounding operand, and uses the third source as
// the pass-through value.
def int_x86_avx512_mask3_vfmadd_ss :
    GCCBuiltin<"__builtin_ia32_vfmaddss3_mask3">,
    Intrinsic<[llvm_v4f32_ty],
              [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
               llvm_i32_ty],
              [IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_mask3_vfmsub_pd_128 :
|
||||
GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
|
||||
Intrinsic<[llvm_v2f64_ty],
|
||||
|
|
|
@ -16938,6 +16938,30 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
|
|||
Src1, Src2, Src3),
|
||||
Mask, PassThru, Subtarget, DAG);
|
||||
}
|
||||
case FMA_OP_SCALAR_MASK:
case FMA_OP_SCALAR_MASK3:
case FMA_OP_SCALAR_MASKZ: {
  // Masked scalar FMA: operands 1-3 are the three sources, operand 4 is the
  // mask and operand 5 is the rounding operand.
  SDValue Src1 = Op.getOperand(1);
  SDValue Src2 = Op.getOperand(2);
  SDValue Src3 = Op.getOperand(3);
  SDValue Mask = Op.getOperand(4);
  SDValue Rnd = Op.getOperand(5);
  MVT VT = Op.getSimpleValueType();

  // Choose the pass-through value handed to getScalarMaskingNode:
  //   MASKZ -> zero vector, MASK3 -> third source, MASK -> first source.
  SDValue PassThru;
  switch (IntrData->Type) {
  case FMA_OP_SCALAR_MASKZ:
    PassThru = getZeroVector(VT, Subtarget, DAG, dl);
    break;
  case FMA_OP_SCALAR_MASK3:
    PassThru = Src3;
    break;
  default:
    PassThru = Src1;
    break;
  }

  // Build the unmasked rounding FMA node first, then wrap it with the mask.
  SDValue FMA = DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                            Src1, Src2, Src3, Rnd);
  return getScalarMaskingNode(FMA, Mask, PassThru, Subtarget, DAG);
}
|
||||
case TERLOG_OP_MASK:
|
||||
case TERLOG_OP_MASKZ: {
|
||||
SDValue Src1 = Op.getOperand(1);
|
||||
|
|
|
@ -4713,9 +4713,9 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
|
|||
string SUFF> {
|
||||
|
||||
defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
|
||||
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
|
||||
(_.VT (OpNode _.RC:$src2, _.RC:$src1,
|
||||
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))),
|
||||
(_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 FROUND_CURRENT))),
|
||||
(_.VT (OpNodeRnd _.RC:$src2, _.RC:$src1,
|
||||
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
|
||||
(_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3,
|
||||
(i32 imm:$rc))),
|
||||
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
|
||||
|
@ -4724,10 +4724,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
|
|||
(_.ScalarLdFrag addr:$src3))))>;
|
||||
|
||||
defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
|
||||
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)),
|
||||
(_.VT (OpNode _.RC:$src2,
|
||||
(_.VT (OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
|
||||
(_.VT (OpNodeRnd _.RC:$src2,
|
||||
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
|
||||
_.RC:$src1)),
|
||||
_.RC:$src1, (i32 FROUND_CURRENT))),
|
||||
(_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1,
|
||||
(i32 imm:$rc))),
|
||||
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
|
||||
|
@ -4736,10 +4736,10 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
|
|||
(_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
|
||||
|
||||
defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
|
||||
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)),
|
||||
(_.VT (OpNode _.RC:$src1,
|
||||
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
|
||||
(_.VT (OpNodeRnd _.RC:$src1,
|
||||
(_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
|
||||
_.RC:$src2)),
|
||||
_.RC:$src2, (i32 FROUND_CURRENT))),
|
||||
(_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2,
|
||||
(i32 imm:$rc))),
|
||||
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
|
||||
|
|
|
@ -27,8 +27,9 @@ enum IntrinsicType {
|
|||
INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
|
||||
INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
|
||||
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
|
||||
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK,
|
||||
VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
|
||||
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
|
||||
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
|
||||
VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
|
||||
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
|
||||
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
|
||||
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
|
||||
|
@ -1748,6 +1749,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
|
||||
X86ISD::FMADD_RND),
|
||||
|
||||
X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADD_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
|
||||
|
@ -1898,6 +1901,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
|
||||
X86ISD::FMADD_RND),
|
||||
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADD_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
|
||||
|
@ -1970,6 +1975,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
|
||||
X86ISD::FMADD_RND),
|
||||
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADD_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
|
||||
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
|
||||
|
|
|
@ -7356,3 +7356,189 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
|
|||
%res2 = add i8 %res, %res1
|
||||
ret i8 %res2
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
|
||||
|
||||
; Exercises llvm.x86.avx512.mask.vfmadd.sd with an all-ones and a variable
; mask, each with rounding operand 4 and rounding operand 3, and sums the four
; results so that every call stays live.
define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  ; Rounding operand 4: the expected instructions carry no rounding suffix.
  %all.r4 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
  %msk.r4 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
  ; Rounding operand 3: the expected instructions carry {rz-sae}.
  %all.r3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
  %msk.r3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
  %sum.r4 = fadd <2 x double> %all.r4, %msk.r4
  %sum.r3 = fadd <2 x double> %all.r3, %msk.r3
  %sum = fadd <2 x double> %sum.r4, %sum.r3
  ret <2 x double> %sum
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
|
||||
|
||||
; Exercises llvm.x86.avx512.mask.vfmadd.ss with an all-ones and a variable
; mask, each with rounding operand 4 and rounding operand 3, and sums the four
; results so that every call stays live.
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  ; Rounding operand 4: the expected instructions carry no rounding suffix.
  %all.r4 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
  %msk.r4 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
  ; Rounding operand 3: the expected instructions carry {rz-sae}.
  %all.r3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
  %msk.r3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
  %sum.r4 = fadd <4 x float> %all.r4, %msk.r4
  %sum.r3 = fadd <4 x float> %all.r3, %msk.r3
  %sum = fadd <4 x float> %sum.r4, %sum.r3
  ret <4 x float> %sum
}
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
|
||||
|
||||
; Exercises llvm.x86.avx512.maskz.vfmadd.sd with a variable mask, once with
; rounding operand 4 and once with rounding operand 3, and adds the two
; results so that both calls stay live.
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
; CHECK-NEXT: retq
  ; Rounding operand 4: the expected instruction carries no rounding suffix.
  %msk.r4 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
  ; Rounding operand 3: the expected instruction carries {rz-sae}.
  %msk.r3 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
  %sum = fadd <2 x double> %msk.r4, %msk.r3
  ret <2 x double> %sum
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
|
||||
|
||||
; Exercises llvm.x86.avx512.maskz.vfmadd.ss with a variable mask, once with
; rounding operand 4 and once with rounding operand 3, and adds the two
; results so that both calls stay live.
;
; The function previously returned the first call's result directly, which
; left the rounding-operand-3 call and the fadd dead and therefore untested.
; It now returns the sum, matching the double-precision maskz test.
; NOTE(review): the expected assembly below was derived by analogy with the
; double-precision maskz test (sd -> ss, vaddpd -> vaddps); regenerate it with
; the llc test-check update script before committing.
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm0
; CHECK-NEXT: retq
  ; Rounding operand 4: the expected instruction carries no rounding suffix.
  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
  ; Rounding operand 3: the expected instruction carries {rz-sae}.
  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}
|
||||
declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
|
||||
|
||||
; Exercises llvm.x86.avx512.mask3.vfmadd.sd with an all-ones and a variable
; mask, each with rounding operand 4 and rounding operand 3, and sums the four
; results so that every call stays live.
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm2, %zmm5
; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  ; Rounding operand 4: the expected instructions carry no rounding suffix.
  %all.r4 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
  %msk.r4 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
  ; Rounding operand 3: the expected instructions carry {rz-sae}.
  %all.r3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
  %msk.r3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
  %sum.r4 = fadd <2 x double> %all.r4, %msk.r4
  %sum.r3 = fadd <2 x double> %all.r3, %msk.r3
  %sum = fadd <2 x double> %sum.r4, %sum.r3
  ret <2 x double> %sum
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
|
||||
|
||||
; Exercises llvm.x86.avx512.mask3.vfmadd.ss with an all-ones and a variable
; mask, each with rounding operand 4 and rounding operand 3, and sums the four
; results so that every call stays live.
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm4
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
; CHECK-NEXT: vmovaps %zmm2, %zmm5
; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
  ; Rounding operand 4: the expected instructions carry no rounding suffix.
  %all.r4 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
  %msk.r4 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
  ; Rounding operand 3: the expected instructions carry {rz-sae}.
  %all.r3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
  %msk.r3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
  %sum.r4 = fadd <4 x float> %all.r4, %msk.r4
  %sum.r3 = fadd <4 x float> %all.r3, %msk.r3
  %sum = fadd <4 x float> %sum.r4, %sum.r3
  ret <4 x float> %sum
}
|
||||
|
||||
; Memory-operand variant for llvm.x86.avx512.mask3.vfmadd.ss: the second
; source is a float loaded from %ptr_b and inserted into lane 0 of an undef
; vector. The expected assembly uses the memory form of vfmadd231ss.
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %b.elt = load float, float* %ptr_b
  %b.vec = insertelement <4 x float> undef, float %b.elt, i32 0
  %fma = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %b.vec, <4 x float> %x1, i8 %x3, i32 4)
  ret <4 x float> %fma
}
|
||||
|
||||
; Memory-operand variant for llvm.x86.avx512.mask.vfmadd.ss: the second
; source is a float loaded from %ptr_b and inserted into lane 0 of an undef
; vector. The expected assembly uses the memory form of vfmadd132ss.
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
  %b.elt = load float, float* %ptr_b
  %b.vec = insertelement <4 x float> undef, float %b.elt, i32 0
  %fma = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %b.vec, <4 x float> %x1, i8 %x3, i32 4)
  ret <4 x float> %fma
}
|
||||
|
||||
|
||||
; Memory-operand variant for llvm.x86.avx512.maskz.vfmadd.ss: the third
; source is a float loaded from %ptr_b and inserted into lane 0 of an undef
; vector. The expected assembly uses the memory form of vfmadd213ss.
; NOTE(review): the mask here is the constant i8 0 (hence the kxorw in the
; expected output) and %x3 is unused, unlike the mask/mask3 memory tests,
; which pass %x3 -- confirm this is intentional.
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
; CHECK: ## BB#0:
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
  %c.elt = load float, float* %ptr_b
  %c.vec = insertelement <4 x float> undef, float %c.elt, i32 0
  %fma = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %c.vec, i8 0, i32 4)
  ret <4 x float> %fma
}
|
||||
|
|
Loading…
Reference in New Issue