forked from OSchip/llvm-project
AVX-512: Intrinsics for ERI
3 instructions: vrcp28, vrsqrt28, vexp2, only vector forms. Intrinsics include SAE (Suppress All Exceptions) parameter. http://reviews.llvm.org/D6214 llvm-svn: 221774
This commit is contained in:
parent
a48273390c
commit
be8808dc3f
|
@ -3050,6 +3050,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
|||
def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
|
||||
llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
|
||||
llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
|
||||
llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
|
||||
def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
|
||||
llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
|
||||
|
|
|
@ -16310,7 +16310,9 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
|
|||
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
|
||||
/// necessary casting for \p Mask when lowering masking intrinsics.
|
||||
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
|
||||
SDValue PreservedSrc, SelectionDAG &DAG) {
|
||||
SDValue PreservedSrc,
|
||||
const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
EVT VT = Op.getValueType();
|
||||
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
|
||||
MVT::i1, VT.getVectorNumElements());
|
||||
|
@ -16337,7 +16339,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
|
|||
case X86ISD::CMPMU:
|
||||
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
|
||||
}
|
||||
|
||||
if (PreservedSrc.getOpcode() == ISD::UNDEF)
|
||||
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
|
||||
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
|
||||
}
|
||||
|
||||
|
@ -16389,10 +16392,11 @@ static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
|
|||
}
|
||||
}
|
||||
|
||||
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
|
||||
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
SDLoc dl(Op);
|
||||
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
|
||||
|
||||
EVT VT = Op.getValueType();
|
||||
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
|
||||
if (IntrData) {
|
||||
switch(IntrData->Type) {
|
||||
|
@ -16404,6 +16408,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
|
|||
case INTR_TYPE_3OP:
|
||||
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
|
||||
Op.getOperand(2), Op.getOperand(3));
|
||||
case INTR_TYPE_1OP_MASK_RM: {
|
||||
SDValue Src = Op.getOperand(1);
|
||||
SDValue Src0 = Op.getOperand(2);
|
||||
SDValue Mask = Op.getOperand(3);
|
||||
SDValue RoundingMode = Op.getOperand(4);
|
||||
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
|
||||
RoundingMode),
|
||||
Mask, Src0, Subtarget, DAG);
|
||||
}
|
||||
|
||||
case CMP_MASK:
|
||||
case CMP_MASK_CC: {
|
||||
// Comparison intrinsics with masks.
|
||||
|
@ -16431,7 +16445,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
|
|||
Op.getOperand(2));
|
||||
}
|
||||
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
|
||||
DAG.getTargetConstant(0, MaskVT), DAG);
|
||||
DAG.getTargetConstant(0, MaskVT),
|
||||
Subtarget, DAG);
|
||||
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
|
||||
DAG.getUNDEF(BitcastVT), CmpMask,
|
||||
DAG.getIntPtrConstant(0));
|
||||
|
@ -16598,7 +16613,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
|
|||
Op.getValueType(), Op.getOperand(2),
|
||||
Op.getOperand(1),
|
||||
Op.getOperand(3)),
|
||||
Op.getOperand(5), Op.getOperand(4), DAG);
|
||||
Op.getOperand(5), Op.getOperand(4),
|
||||
Subtarget, DAG);
|
||||
|
||||
// ptest and testp intrinsics. The intrinsic these come from are designed to
|
||||
// return an integer value, not just an instruction so lower it to the ptest
|
||||
|
@ -16772,7 +16788,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
|
|||
Op.getOperand(1),
|
||||
Op.getOperand(2),
|
||||
Op.getOperand(3)),
|
||||
Op.getOperand(4), Op.getOperand(1), DAG);
|
||||
Op.getOperand(4), Op.getOperand(1),
|
||||
Subtarget, DAG);
|
||||
else
|
||||
return SDValue();
|
||||
}
|
||||
|
@ -18887,7 +18904,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|||
case ISD::VASTART: return LowerVASTART(Op, DAG);
|
||||
case ISD::VAARG: return LowerVAARG(Op, DAG);
|
||||
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
|
||||
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
|
||||
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
|
||||
case ISD::INTRINSIC_VOID:
|
||||
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
|
||||
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
|
||||
|
|
|
@ -419,6 +419,9 @@ namespace llvm {
|
|||
// Test if in transactional execution.
|
||||
XTEST,
|
||||
|
||||
// ERI instructions
|
||||
RSQRT28, RCP28, EXP2,
|
||||
|
||||
// Compare and swap.
|
||||
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||
LCMPXCHG8_DAG,
|
||||
|
|
|
@ -146,20 +146,21 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
|
|||
list<dag> Pattern,
|
||||
list<dag> MaskingPattern,
|
||||
list<dag> ZeroMaskingPattern,
|
||||
string Round = "",
|
||||
string MaskingConstraint = "",
|
||||
InstrItinClass itin = NoItinerary,
|
||||
bit IsCommutable = 0> {
|
||||
let isCommutable = IsCommutable in
|
||||
def NAME: AVX512<O, F, Outs, Ins,
|
||||
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
|
||||
"$dst, "#IntelSrcAsm#"}",
|
||||
OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"#
|
||||
"$dst "#Round#", "#IntelSrcAsm#"}",
|
||||
Pattern, itin>;
|
||||
|
||||
// Prefer over VMOV*rrk Pat<>
|
||||
let AddedComplexity = 20 in
|
||||
def NAME#k: AVX512<O, F, Outs, MaskingIns,
|
||||
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
|
||||
"$dst {${mask}}, "#IntelSrcAsm#"}",
|
||||
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"#
|
||||
"$dst {${mask}}"#Round#", "#IntelSrcAsm#"}",
|
||||
MaskingPattern, itin>,
|
||||
EVEX_K {
|
||||
// In case of the 3src subclass this is overridden with a let.
|
||||
|
@ -167,8 +168,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
|
|||
}
|
||||
let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
|
||||
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
|
||||
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
|
||||
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
|
||||
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"#
|
||||
"$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}",
|
||||
ZeroMaskingPattern,
|
||||
itin>,
|
||||
EVEX_KZ;
|
||||
|
@ -182,6 +183,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
|
|||
string OpcodeStr,
|
||||
string AttSrcAsm, string IntelSrcAsm,
|
||||
dag RHS, dag MaskingRHS,
|
||||
string Round = "",
|
||||
string MaskingConstraint = "",
|
||||
InstrItinClass itin = NoItinerary,
|
||||
bit IsCommutable = 0> :
|
||||
|
@ -191,7 +193,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
|
|||
[(set _.RC:$dst, MaskingRHS)],
|
||||
[(set _.RC:$dst,
|
||||
(vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
|
||||
MaskingConstraint, NoItinerary, IsCommutable>;
|
||||
Round, MaskingConstraint, NoItinerary, IsCommutable>;
|
||||
|
||||
// This multiclass generates the unconditional/non-masking, the masking and
|
||||
// the zero-masking variant of the instruction. In the masking case, the
|
||||
|
@ -199,13 +201,14 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
|
|||
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
|
||||
dag Outs, dag Ins, string OpcodeStr,
|
||||
string AttSrcAsm, string IntelSrcAsm,
|
||||
dag RHS, InstrItinClass itin = NoItinerary,
|
||||
dag RHS, string Round = "",
|
||||
InstrItinClass itin = NoItinerary,
|
||||
bit IsCommutable = 0> :
|
||||
AVX512_maskable_common<O, F, _, Outs, Ins,
|
||||
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
|
||||
!con((ins _.KRCWM:$mask), Ins),
|
||||
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
|
||||
(vselect _.KRCWM:$mask, RHS, _.RC:$src0),
|
||||
(vselect _.KRCWM:$mask, RHS, _.RC:$src0), Round,
|
||||
"$src0 = $dst", itin, IsCommutable>;
|
||||
|
||||
// Similar to AVX512_maskable but in this case one of the source operands
|
||||
|
@ -232,7 +235,7 @@ multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
|
|||
AVX512_maskable_custom<O, F, Outs, Ins,
|
||||
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
|
||||
!con((ins _.KRCWM:$mask), Ins),
|
||||
OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
|
||||
OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "",
|
||||
"$src0 = $dst">;
|
||||
|
||||
// Bitcasts between 512-bit vector types. Return the original type since
|
||||
|
@ -2626,7 +2629,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
|||
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
|
||||
"$src2, $src1", "$src1, $src2",
|
||||
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
|
||||
itins.rr, IsCommutable>,
|
||||
"", itins.rr, IsCommutable>,
|
||||
AVX512BIBase, EVEX_4V;
|
||||
|
||||
let mayLoad = 1 in
|
||||
|
@ -2635,7 +2638,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
|||
"$src2, $src1", "$src1, $src2",
|
||||
(_.VT (OpNode _.RC:$src1,
|
||||
(bitconvert (_.LdFrag addr:$src2)))),
|
||||
itins.rm>,
|
||||
"", itins.rm>,
|
||||
AVX512BIBase, EVEX_4V;
|
||||
}
|
||||
|
||||
|
@ -2651,7 +2654,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
|||
(_.VT (OpNode _.RC:$src1,
|
||||
(X86VBroadcast
|
||||
(_.ScalarLdFrag addr:$src2)))),
|
||||
itins.rm>,
|
||||
"", itins.rm>,
|
||||
AVX512BIBase, EVEX_4V, EVEX_B;
|
||||
}
|
||||
|
||||
|
@ -4199,44 +4202,44 @@ def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1),
|
|||
(COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
|
||||
|
||||
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
|
||||
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr,
|
||||
RegisterClass RC, X86MemOperand x86memop> {
|
||||
let hasSideEffects = 0, Predicates = [HasERI] in {
|
||||
def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
|
||||
!strconcat(OpcodeStr,
|
||||
" \t{$src, $dst|$dst, $src}"),
|
||||
[]>, EVEX;
|
||||
def rb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
|
||||
!strconcat(OpcodeStr,
|
||||
" \t{{sae}, $src, $dst|$dst, $src, {sae}}"),
|
||||
[]>, EVEX, EVEX_B;
|
||||
def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
|
||||
!strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
|
||||
[]>, EVEX;
|
||||
}
|
||||
|
||||
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
|
||||
SDNode OpNode> {
|
||||
|
||||
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
||||
(ins _.RC:$src), OpcodeStr, "$src", "$src",
|
||||
(OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
|
||||
|
||||
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
|
||||
(ins _.RC:$src), OpcodeStr,
|
||||
"$src", "$src",
|
||||
(OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
|
||||
|
||||
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
|
||||
(OpNode (_.FloatVT
|
||||
(bitconvert (_.LdFrag addr:$src))), (i32 FROUND_CURRENT))>;
|
||||
|
||||
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
|
||||
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
|
||||
(OpNode (_.FloatVT
|
||||
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
|
||||
(i32 FROUND_CURRENT))>, EVEX_B;
|
||||
}
|
||||
defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>,
|
||||
EVEX_V512, EVEX_CD8<32, CD8VF>;
|
||||
defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>,
|
||||
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
|
||||
defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>,
|
||||
EVEX_V512, EVEX_CD8<32, CD8VF>;
|
||||
defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>,
|
||||
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
|
||||
|
||||
def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src),
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
|
||||
(VRSQRT28PSZrb VR512:$src)>;
|
||||
def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src),
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
|
||||
(VRSQRT28PDZrb VR512:$src)>;
|
||||
/// avx512_eri - Instantiates the packed single-precision (PS, v16f32) and
/// packed double-precision (PD, v8f64) forms of one ERI instruction by
/// expanding avx512_fp28_p once per element type.
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic stem; "ps" / "pd" is appended here.
///   OpNode    - SDNode selected by the generated patterns.
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
            EVEX_CD8<32, CD8VF>;
  // PD operates on 64-bit elements, so the compressed-displacement element
  // size must be 64 (not 32); otherwise disp8 scaling for the broadcast
  // memory form would be computed from the wrong element width.
  defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
            VEX_W, EVEX_CD8<64, CD8VF>;
}
|
||||
|
||||
def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src),
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
|
||||
(VRCP28PSZrb VR512:$src)>;
|
||||
def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src),
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
|
||||
(VRCP28PDZrb VR512:$src)>;
|
||||
let Predicates = [HasERI], hasSideEffects = 0 in {
|
||||
|
||||
defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD;
|
||||
defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX, EVEX_V512, T8PD;
|
||||
defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX, EVEX_V512, T8PD;
|
||||
}
|
||||
|
||||
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
|
||||
SDNode OpNode, X86VectorVTInfo _>{
|
||||
|
|
|
@ -203,6 +203,8 @@ def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
|
|||
|
||||
def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
|
||||
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
|
||||
def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
|
||||
SDTCisVec<0>, SDTCisInt<2>]>;
|
||||
|
||||
def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
|
||||
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
|
||||
|
@ -261,6 +263,10 @@ def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
|
|||
def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
|
||||
def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
|
||||
|
||||
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;
|
||||
def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>;
|
||||
def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;
|
||||
|
||||
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
|
||||
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
|
||||
SDTCisVT<4, i8>]>;
|
||||
|
|
|
@ -20,7 +20,7 @@ enum IntrinsicType {
|
|||
INTR_NO_TYPE,
|
||||
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
|
||||
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
|
||||
CMP_MASK, CMP_MASK_CC, VSHIFT, COMI
|
||||
CMP_MASK, CMP_MASK_CC, VSHIFT, COMI, INTR_TYPE_1OP_MASK_RM
|
||||
};
|
||||
|
||||
struct IntrinsicData {
|
||||
|
@ -156,6 +156,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
|
||||
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
|
||||
X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
|
||||
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
|
||||
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
|
||||
|
@ -204,6 +206,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
|
||||
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
|
||||
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
|
||||
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
|
||||
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
|
||||
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
|
||||
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
|
||||
X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
|
||||
|
|
|
@ -60,20 +60,6 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
|
|||
}
|
||||
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
|
||||
|
||||
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
|
||||
; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
|
||||
ret <16 x float> %res
|
||||
}
|
||||
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
|
||||
; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
|
||||
%res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
|
||||
ret <8 x double> %res
|
||||
}
|
||||
declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
|
||||
|
||||
declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
|
||||
|
||||
define <8 x double> @test7(<8 x double> %a) {
|
||||
|
@ -97,13 +83,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
|
|||
}
|
||||
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
|
||||
|
||||
define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
|
||||
; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
|
||||
ret <16 x float> %res
|
||||
}
|
||||
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
|
||||
; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
|
||||
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
|
||||
|
@ -111,13 +90,6 @@ define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
|
|||
}
|
||||
declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
|
||||
|
||||
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
|
||||
; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
|
||||
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
|
||||
|
||||
define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
|
||||
; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
|
||||
%res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
|
||||
|
@ -125,13 +97,6 @@ define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
|
|||
}
|
||||
declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
|
||||
|
||||
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
|
||||
; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
|
||||
%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
|
||||
|
||||
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
|
||||
; CHECK: vsqrtpd
|
||||
%res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1]
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
|
||||
|
||||
define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
|
||||
; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
|
||||
; CHECK: kmovw
|
||||
; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
|
||||
; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
|
||||
; CHECK: kmovw
|
||||
; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
|
||||
; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
|
||||
; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
|
||||
; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
|
||||
%res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
|
||||
ret <8 x double> %res
|
||||
}
|
||||
declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
|
||||
|
||||
define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
|
||||
; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
|
||||
%res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
|
||||
; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
|
||||
%res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
|
||||
ret <8 x double> %res
|
||||
}
|
||||
declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
|
||||
|
||||
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
|
||||
; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
|
||||
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
|
||||
|
||||
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
|
||||
; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
|
||||
%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
|
||||
ret <4 x float> %res
|
||||
}
|
||||
declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
|
||||
|
Loading…
Reference in New Issue