Reland "Reland "Reland "[X86][RFC] Enable `_Float16` type support on X86 following the psABI"""

Fix the crash when lowering X86ISD::FCMP.
Phoebe Wang 2022-06-17 11:25:22 +08:00
parent de74756571
commit 04a3d5f3a1
50 changed files with 5036 additions and 4699 deletions


@ -138,7 +138,7 @@ Changes to the WebAssembly Backend
Changes to the X86 Backend
--------------------------
* ...
* Support ``half`` type on SSE2 and above targets.
Changes to the OCaml bindings
-----------------------------
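To make the new release note concrete: with this change, scalar ``half`` (``_Float16``) is supported on any SSE2-capable x86 target, not only under AVX512-FP16. The snippet below is an illustrative sketch, not part of the commit; it assumes Clang's ``_Float16`` extension and a plain SSE2 target, where the half arithmetic is promoted to float and rounded back through __truncsfhf2/__extendhfsf2 (or F16C conversions when available), as the lowering changes later in this diff set up.

// Illustrative sketch only; not taken from the commit.
// Build assumption: clang++ -O2 -msse2 fp16_demo.cpp
#include <cstdio>

_Float16 scale_and_bias(_Float16 x, _Float16 y) {
  // The half multiply/add are promoted to float by the backend and the
  // result is rounded back to half precision.
  return x * y + static_cast<_Float16>(1.0f);
}

int main() {
  _Float16 a = static_cast<_Float16>(1.5f);
  _Float16 b = static_cast<_Float16>(2.0f);
  std::printf("%f\n", static_cast<float>(scale_and_bias(a, b)));
  return 0;
}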


@ -148,8 +148,7 @@ private:
/// computed in an SSE register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
(VT == MVT::f32 && Subtarget->hasSSE1()) ||
(VT == MVT::f16 && Subtarget->hasFP16());
(VT == MVT::f32 && Subtarget->hasSSE1()) || VT == MVT::f16;
}
bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@ -2281,12 +2280,13 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
default: return false;
case MVT::i8: Opc = X86::CMOV_GR8; break;
case MVT::i16: Opc = X86::CMOV_GR16; break;
case MVT::f16: Opc = X86::CMOV_FR16X; break;
case MVT::i32: Opc = X86::CMOV_GR32; break;
case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X
: X86::CMOV_FR32; break;
case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X
: X86::CMOV_FR64; break;
case MVT::f16:
Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X : X86::CMOV_FR16; break;
case MVT::f32:
Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break;
case MVT::f64:
Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X : X86::CMOV_FR64; break;
}
const Value *Cond = I->getOperand(0);
@ -3903,6 +3903,9 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f16:
Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH;
break;
case MVT::f32:
Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
: HasSSE1 ? X86::FsFLD0SS


@ -553,9 +553,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
// f32 and f64 use SSE.
// f16, f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
: &X86::FR16RegClass);
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
: &X86::FR32RegClass);
addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
@ -587,6 +591,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSINCOS, VT, Expand);
}
// Half type will be promoted by default.
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FMA, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FCEIL, MVT::f16, Promote);
setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::BR_CC, MVT::f16, Promote);
setOperationAction(ISD::SETCC, MVT::f16, Promote);
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
// Lower this to MOVMSK plus an AND.
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
@ -661,6 +708,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
// Support fp16 0 immediate.
if (isTypeLegal(MVT::f16))
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
@ -670,7 +721,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
@ -722,7 +772,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
if (isTypeLegal(MVT::f16)) {
setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
} else {
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
}
// FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
// as Custom.
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
@ -1445,6 +1500,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
}
// This block controls legalization of the mask vector sizes that are
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
@ -1973,10 +2035,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
if (isTypeLegal(MVT::f80)) {
setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
}
setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
@ -2062,9 +2120,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
setOperationAction(ISD::STORE, MVT::v4f16, Custom);
}
// Support fp16 0 immediate
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
@ -3914,7 +3969,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else if (Is64Bit && RegVT == MVT::i64)
RC = &X86::GR64RegClass;
else if (RegVT == MVT::f16)
RC = &X86::FR16XRegClass;
RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
else if (RegVT == MVT::f32)
RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
else if (RegVT == MVT::f64)
@ -5669,8 +5724,7 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
}
bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
(VT == MVT::f16 && Subtarget.hasFP16());
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
}
bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
@ -5682,8 +5736,7 @@ bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
(VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f16 && Subtarget.hasFP16());
(VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
}
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
@ -20740,6 +20793,16 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
return Cvt;
}
template<typename T>
static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
return VT == MVT::f16 && !Subtarget.hasFP16();
}
template<typename T>
bool X86TargetLowering::isSoftFP16(T VT) const {
return ::isSoftFP16(VT, Subtarget);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@ -20781,6 +20844,10 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
// Bail out when we don't have native conversion instructions.
if (isSoftFP16(VT))
return SDValue();
bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
// These are really Legal; return the operand so the caller accepts it as
@ -21246,7 +21313,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
MVT DstVT = Op->getSimpleValueType(0);
SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
// Bail out when we don't have native conversion instructions.
if (DstVT == MVT::f128 || isSoftFP16(DstVT))
return SDValue();
if (DstVT.isVector())
@ -22069,6 +22137,16 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Res;
if (isSoftFP16(SrcVT)) {
if (IsStrict)
return DAG.getNode(
Op.getOpcode(), dl, {VT, MVT::Other},
{Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
{Chain, Src})});
return DAG.getNode(Op.getOpcode(), dl, VT,
DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src));
}
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
@ -22406,6 +22484,9 @@ SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
if (SrcVT == MVT::f16)
return SDValue();
// If the source is in an SSE register, the node is Legal.
if (isScalarFPTypeInSSEReg(SrcVT))
return Op;
@ -22477,7 +22558,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
// This code is only for floats and doubles. Fall back to generic code for
// anything else.
if (!isScalarFPTypeInSSEReg(SrcVT))
if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
return SDValue();
EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@ -22612,28 +22693,53 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128)
if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80))
return SDValue();
if (VT == MVT::f80) {
if (SVT == MVT::f16) {
assert(Subtarget.hasFP16() && "Unexpected features!");
RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
MakeLibCallOptions CallOptions;
std::pair<SDValue, SDValue> Tmp =
makeLibCall(DAG, LC, VT, In, CallOptions, DL,
IsStrict ? Op.getOperand(0) : SDValue());
if (SVT == MVT::f16) {
if (Subtarget.hasFP16())
return Op;
if (!Subtarget.hasF16C())
return SDValue();
if (VT != MVT::f32) {
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
else
return Tmp.first;
return DAG.getNode(
ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
{Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
{MVT::f32, MVT::Other}, {Chain, In})});
return DAG.getNode(ISD::FP_EXTEND, DL, VT,
DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
}
return Op;
In = DAG.getBitcast(MVT::i16, In);
In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
DAG.getIntPtrConstant(0, DL));
SDValue Res;
if (IsStrict) {
Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
{Chain, In});
Chain = Res.getValue(1);
} else {
Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
DAG.getTargetConstant(4, DL, MVT::i32));
}
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
DAG.getIntPtrConstant(0, DL));
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
}
if (!SVT.isVector())
return Op;
if (SVT.getVectorElementType() == MVT::f16) {
assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
if (SVT == MVT::v2f16)
@ -22659,15 +22765,64 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved or we're converting f80->f16.
if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
return Op;
if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
return SDValue();
return SDValue();
if (VT == MVT::f16) {
if (Subtarget.hasFP16())
return Op;
if (!Subtarget.hasF16C())
return SDValue();
if (SVT != MVT::f32) {
if (IsStrict)
return DAG.getNode(
ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
{Chain,
DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other},
{Chain, In, Op2}),
Op2});
return DAG.getNode(ISD::FP_ROUND, DL, VT,
DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2),
Op2);
}
SDValue Res;
SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
MVT::i32);
if (IsStrict) {
Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
DAG.getConstantFP(0, DL, MVT::v4f32), In,
DAG.getIntPtrConstant(0, DL));
Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
{Chain, Res, Rnd});
Chain = Res.getValue(1);
} else {
// FIXME: Should we use zeros for upper elements for non-strict?
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
}
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
DAG.getIntPtrConstant(0, DL));
Res = DAG.getBitcast(MVT::f16, Res);
if (IsStrict)
return DAG.getMergeValues({Res, Chain}, DL);
return Res;
}
return Op;
}
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
@ -24690,6 +24845,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op1.getSimpleValueType();
SDValue CC;
if (isSoftFP16(VT))
return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond,
DAG.getBitcast(MVT::i16, Op1),
DAG.getBitcast(MVT::i16, Op2)));
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
@ -24754,7 +24914,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (Cond.getOpcode() == ISD::SETCC) {
if (Cond.getOpcode() == ISD::SETCC &&
!isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
// If the condition was updated, it's possible that the operands of the
@ -25429,8 +25590,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
// Bail out when we don't have native compare instructions.
if (Cond.getOpcode() == ISD::SETCC &&
Cond.getOperand(0).getValueType() != MVT::f128) {
Cond.getOperand(0).getValueType() != MVT::f128 &&
!isSoftFP16(Cond.getOperand(0).getValueType())) {
SDValue LHS = Cond.getOperand(0);
SDValue RHS = Cond.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@ -34152,6 +34315,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR16:
case X86::CMOV_FR16X:
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
@ -35827,6 +35991,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR16:
case X86::CMOV_FR16X:
case X86::CMOV_FR32:
case X86::CMOV_FR32X:
case X86::CMOV_FR64:
@ -44101,7 +44267,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
VT != MVT::f80 && VT != MVT::f128 &&
VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
(TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget.hasSSE2() ||
(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
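The isSoftFP16 bail-outs added to LowerSELECT, LowerBRCOND and combineSelect above keep half-precision compares and selects away from the native FP compare/cmov lowering when the target lacks AVX512-FP16 (likely the configuration behind the X86ISD::FCMP crash noted in the commit message); instead the compare operands are promoted to f32 and the select is done on the i16 bit pattern. A minimal hypothetical example of source that exercises this path (assuming Clang's ``_Float16`` extension; not taken from the commit):

// Hypothetical sketch; not part of the diff.
// On targets without AVX512-FP16, the f16 compare is promoted to f32 and
// the f16 select is performed on the i16 bit pattern.
_Float16 clamp_low(_Float16 v, _Float16 lo) {
  return v < lo ? lo : v;  // half compare feeding a half select
}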


@ -1640,6 +1640,8 @@ namespace llvm {
bool needsCmpXchgNb(Type *MemType) const;
template<typename T> bool isSoftFP16(T VT) const;
void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB, int FI) const;


@ -476,6 +476,7 @@ let Predicates = [HasAVX512] in {
def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
}
@ -508,25 +509,23 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
}
let Predicates = [HasFP16] in {
def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>;
def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>;
def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>;
}
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
[(set FR16X:$dst, fp16imm0)]>;
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@ -535,12 +534,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
[(set VR128X:$dst, fp128imm0)]>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in {
def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "",
[(set FR16X:$dst, fp16imm0)]>;
}
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
@ -678,21 +671,21 @@ defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>;
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
// Codegen pattern with the alternative types insert VEC128 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>;
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
// Codegen pattern with the alternative types insert VEC256 into VEC512
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>;
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
@ -979,7 +972,7 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>;
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
// Codegen pattern with the alternative types extract VEC128 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
@ -987,14 +980,14 @@ defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>;
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
// Codegen pattern with the alternative types extract VEC256 from VEC512
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>;
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
@ -1020,6 +1013,10 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
(v8i16 (VEXTRACTI128rr
(v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
(v8f16 (VEXTRACTF128rr
(v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
(v16i8 (VEXTRACTI128rr
(v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
@ -1049,18 +1046,16 @@ def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
(v8i16 (VEXTRACTI32x4Z256rr
(v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
(v8f16 (VEXTRACTF32x4Z256rr
(v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
(v16i8 (VEXTRACTI32x4Z256rr
(v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
(iPTR 1)))>;
}
let Predicates = [HasFP16, HasVLX] in
def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))),
(v8f16 (VEXTRACTF32x4Z256rr
(v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)),
(iPTR 1)))>;
// Additional patterns for handling a bitcast between the vselect and the
// extract_subvector.
@ -1478,7 +1473,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
Sched<[SchedWriteShuffle.YMM.Folded]>,
AVX5128IBase, EVEX;
}
let Predicates = [HasFP16] in {
let Predicates = [HasBWI] in {
def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)),
(VPBROADCASTWZrm addr:$src)>;
@ -1487,7 +1482,7 @@ let Predicates = [HasFP16] in {
def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))),
(VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>;
}
let Predicates = [HasVLX, HasFP16] in {
let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
(VPBROADCASTWZ128rm addr:$src)>;
def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
@ -3763,6 +3758,9 @@ let Predicates = [HasBWI, NoVLX] in {
defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>;
defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>;
defm : mask_move_lowering<"VMOVDQU16Z", v8f16x_info, v32f16_info>;
defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>;
}
let Predicates = [HasAVX512] in {
@ -3852,7 +3850,7 @@ let Predicates = [HasVLX] in {
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
let Predicates = [HasFP16] in {
let Predicates = [HasBWI] in {
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))),
(VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
@ -3887,7 +3885,7 @@ let Predicates = [HasFP16] in {
def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
(VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
}
let Predicates = [HasFP16, HasVLX] in {
let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
@ -4099,14 +4097,14 @@ def : Pat<(f64 (bitconvert VK64:$src)),
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _,
list<Predicate> prd = [HasAVX512, OptForSize]> {
let Predicates = prd in
X86VectorVTInfo _, Predicate prd = HasAVX512> {
let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
_.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@ -4159,6 +4157,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
[], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>,
NotMemoryFoldable;
}
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
@ -4168,7 +4167,7 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info,
[HasFP16]>,
HasFP16>,
VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>;
multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
@ -4338,14 +4337,9 @@ def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
addr:$srcAddr)>;
}
defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@ -4353,6 +4347,12 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
let Predicates = [HasFP16] in {
defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>;
defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (insert_subvector
(v32i1 immAllZerosV),
@ -4360,6 +4360,30 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
(iPTR 0))),
(v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
GR8, sub_8bit>;
defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (insert_subvector
(v32i1 immAllZerosV),
(v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
(iPTR 0))),
(v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
GR8, sub_8bit>;
def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
(COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
(v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
(v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
(COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
(v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
}
defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (insert_subvector
(v16i1 immAllZerosV),
@ -4385,10 +4409,6 @@ defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
(iPTR 0))), GR8, sub_8bit>;
defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>;
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@ -4396,13 +4416,6 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info,
(v32i1 (insert_subvector
(v32i1 immAllZerosV),
(v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
(iPTR 0))),
(v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
GR8, sub_8bit>;
defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (insert_subvector
(v16i1 immAllZerosV),
@ -4428,16 +4441,6 @@ defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
(iPTR 0))), GR8, sub_8bit>;
def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))),
(COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk
(v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)),
VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
(v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)),
(COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)),
(v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>;
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)),
@ -11651,6 +11654,14 @@ defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
// Always select FP16 instructions if available.
let Predicates = [HasBWI], AddedComplexity = -10 in {
def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>;
def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>;
def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>;
}
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
//===----------------------------------------------------------------------===//
@ -12988,7 +12999,6 @@ def : Pat<(i16 (bitconvert FR16X:$src)),
sub_16bit))>;
def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))),
(i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>;
}
// Allow "vmovw" to use GR64
let hasSideEffects = 0 in {
@ -12997,6 +13007,7 @@ let hasSideEffects = 0 in {
def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>;
}
}
// Convert 16-bit float to i16/u16
multiclass avx512_cvtph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,


@ -562,12 +562,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
let Predicates = [HasMMX] in
defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>;
defm _FR16X : CMOVrr_PSEUDO<FR16X, f16>;
let Predicates = [HasSSE1,NoAVX512] in
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
let Predicates = [HasSSE2,NoAVX512] in
let Predicates = [HasSSE2,NoAVX512] in {
defm _FR16 : CMOVrr_PSEUDO<FR16, f16>;
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
}
let Predicates = [HasAVX512] in {
defm _FR16X : CMOVrr_PSEUDO<FR16X, f16>;
defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
}


@ -765,6 +765,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case X86::AVX_SET0:
case X86::FsFLD0SD:
case X86::FsFLD0SS:
case X86::FsFLD0SH:
case X86::FsFLD0F128:
case X86::KSET0D:
case X86::KSET0Q:
@ -3583,10 +3584,6 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
case 2:
if (X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
if (X86::FR16XRegClass.hasSubClassEq(RC)) {
assert(STI.hasFP16());
return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
}
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
return load ? X86::MOV16rm : X86::MOV16mr;
case 4:
@ -3614,6 +3611,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg,
X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
X86::VK16PAIRRegClass.hasSubClassEq(RC))
return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
if ((X86::FR16RegClass.hasSubClassEq(RC) ||
X86::FR16XRegClass.hasSubClassEq(RC)) &&
STI.hasFP16())
return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
@ -3853,12 +3854,12 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILESTORED;
// tilestored %tmm, (%sp, %idx)
MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
MachineInstr *NewMI =
@ -3867,6 +3868,14 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(2);
MO.setReg(VirtReg);
MO.setIsKill(true);
} else if ((RC->getID() == X86::FR16RegClassID ||
RC->getID() == X86::FR16XRegClassID) &&
!Subtarget.hasFP16()) {
unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZmr
: Subtarget.hasAVX() ? X86::VMOVSSmr
: X86::MOVSSmr;
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
} else {
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
@ -3895,6 +3904,14 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineOperand &MO = NewMI->getOperand(3);
MO.setReg(VirtReg);
MO.setIsKill(true);
} else if ((RC->getID() == X86::FR16RegClassID ||
RC->getID() == X86::FR16XRegClassID) &&
!Subtarget.hasFP16()) {
unsigned Opc = Subtarget.hasAVX512() ? X86::VMOVSSZrm
: Subtarget.hasAVX() ? X86::VMOVSSrm
: X86::MOVSSrm;
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
} else {
const MachineFunction &MF = *MBB.getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
@ -4870,6 +4887,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
case X86::FsFLD0SH:
case X86::FsFLD0F128:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
case X86::AVX_SET0: {
@ -6605,6 +6623,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_FsFLD0SS:
Alignment = Align(4);
break;
case X86::FsFLD0SH:
case X86::AVX512_FsFLD0SH:
Alignment = Align(2);
break;
@ -6643,6 +6662,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SH:
case X86::AVX512_FsFLD0SH:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
@ -6682,7 +6702,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Ty = Type::getDoubleTy(MF.getFunction().getContext());
else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
Ty = Type::getFP128Ty(MF.getFunction().getContext());
else if (Opc == X86::AVX512_FsFLD0SH)
else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH)
Ty = Type::getHalfTy(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),


@ -112,6 +112,8 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
[(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
@ -3965,6 +3967,20 @@ defm PINSRW : sse2_pinsrw, PD;
} // ExeDomain = SSEPackedInt
// Always select FP16 instructions if available.
let Predicates = [UseSSE2], AddedComplexity = -10 in {
def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
}
let Predicates = [HasAVX, NoBWI] in {
def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
}
//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//
@ -5193,6 +5209,12 @@ let Predicates = [HasAVX, NoBWI] in
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
let Predicates = [UseSSE41] in
def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
let Predicates = [HasAVX, NoBWI] in
def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
@ -7575,6 +7597,21 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
(VPBROADCASTWYrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR16:$src, sub_16bit))))>;
def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
(VPBROADCASTWrm addr:$src)>;
def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
(VPBROADCASTWYrm addr:$src)>;
def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
(VPBROADCASTWrr VR128:$src)>;
def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
(VPBROADCASTWYrr VR128:$src)>;
def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
(VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
(VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),


@ -17,6 +17,8 @@
let Predicates = [NoAVX512] in {
// A vector extract of the first f32/f64 position is a subregister copy
def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))),
(COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>;
def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
(COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
@ -34,8 +36,8 @@ let Predicates = [HasAVX512] in {
}
let Predicates = [NoVLX] in {
def : Pat<(v8f16 (scalar_to_vector FR16X:$src)),
(COPY_TO_REGCLASS FR16X:$src, VR128)>;
def : Pat<(v8f16 (scalar_to_vector FR16:$src)),
(COPY_TO_REGCLASS FR16:$src, VR128)>;
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
(COPY_TO_REGCLASS FR32:$src, VR128)>;


@ -179,6 +179,8 @@ X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
return &X86::GR64RegClass;
}
if (RB.getID() == X86::VECRRegBankID) {
if (Ty.getSizeInBits() == 16)
return STI.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
if (Ty.getSizeInBits() == 32)
return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
if (Ty.getSizeInBits() == 64)


@ -537,6 +537,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)> {let Size = 32;}
// FIXME: This sets up the floating point register files as though they are f64
// values, though they really are f80 values. This will cause us to spill
@ -599,7 +601,7 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>;
def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}
// Extended VR128 and VR256 for AVX-512 instructions
def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128],


@ -847,228 +847,228 @@ define void @casts() {
define void @fp16() {
; SSE2-LABEL: 'fp16'
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 229 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 234 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 214 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 232 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'fp16'
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX1-LABEL: 'fp16'
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 119 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX2-LABEL: 'fp16'
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; AVX512-LABEL: 'fp16'
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
@@ -1112,56 +1112,56 @@ define void @fp16() {
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SLM-LABEL: 'fp16'
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 159 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u1 = call i1 @llvm.fptoui.sat.i1.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s8 = call i8 @llvm.fptosi.sat.i8.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u8 = call i8 @llvm.fptoui.sat.i8.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s16 = call i16 @llvm.fptosi.sat.i16.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u16 = call i16 @llvm.fptoui.sat.i16.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s32 = call i32 @llvm.fptosi.sat.i32.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 117 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef)
; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef)


@ -28,8 +28,8 @@ body: |
liveins: $rdi, $rsi
; CHECK-LABEL: name: test
; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi,
INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags
; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi,
INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags
$rax = MOV64rr killed $rsi
RET64 killed $rax
...
@ -45,8 +45,8 @@ body: |
; Verify that the register ties are preserved.
; CHECK-LABEL: name: test2
; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags
INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags
; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags
INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags
$rax = MOV64rr killed $rsi
RET64 killed $rax
...


@ -4,9 +4,9 @@
; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X86,X86-AVX
; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X86,X86-AVX
; RUN: llc < %s -mtriple=i386-linux-generic -verify-machineinstrs | FileCheck %s --check-prefixes=X86,X86-NOSSE
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64,X64-SSE
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2 | FileCheck %s --check-prefixes=X64-SSE
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx | FileCheck %s --check-prefixes=X64-AVX
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX
; Note: This test is testing that the lowering for atomics matches what we
; currently emit for non-atomics + the atomic restriction. The presence of
@ -16,17 +16,45 @@
; and their calling convention which remain unresolved.)
define void @store_half(half* %fptr, half %v) {
; X86-LABEL: store_half:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movw %ax, (%ecx)
; X86-NEXT: retl
; X86-SSE1-LABEL: store_half:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT: movw %ax, (%ecx)
; X86-SSE1-NEXT: retl
;
; X64-LABEL: store_half:
; X64: # %bb.0:
; X64-NEXT: movw %si, (%rdi)
; X64-NEXT: retq
; X86-SSE2-LABEL: store_half:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movw %cx, (%eax)
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: store_half:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movw %cx, (%eax)
; X86-AVX-NEXT: retl
;
; X86-NOSSE-LABEL: store_half:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOSSE-NEXT: movw %ax, (%ecx)
; X86-NOSSE-NEXT: retl
;
; X64-SSE-LABEL: store_half:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pextrw $0, %xmm0, %eax
; X64-SSE-NEXT: movw %ax, (%rdi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: store_half:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpextrw $0, %xmm0, %eax
; X64-AVX-NEXT: movw %ax, (%rdi)
; X64-AVX-NEXT: retq
store atomic half %v, half* %fptr unordered, align 2
ret void
}
@ -193,16 +221,43 @@ define void @store_fp128(fp128* %fptr, fp128 %v) {
}
define half @load_half(half* %fptr) {
; X86-LABEL: load_half:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: retl
; X86-SSE1-LABEL: load_half:
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: movzwl (%eax), %eax
; X86-SSE1-NEXT: retl
;
; X64-LABEL: load_half:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: retq
; X86-SSE2-LABEL: load_half:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzwl (%eax), %eax
; X86-SSE2-NEXT: pinsrw $0, %eax, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: load_half:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movzwl (%eax), %eax
; X86-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X86-NOSSE-LABEL: load_half:
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOSSE-NEXT: movzwl (%eax), %eax
; X86-NOSSE-NEXT: retl
;
; X64-SSE-LABEL: load_half:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movzwl (%rdi), %eax
; X64-SSE-NEXT: pinsrw $0, %eax, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: load_half:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movzwl (%rdi), %eax
; X64-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; X64-AVX-NEXT: retq
%v = load atomic half, half* %fptr unordered, align 2
ret half %v
}


@ -2254,22 +2254,19 @@ define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %
; KNL-NEXT: korw %k2, %k1, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kshiftrw $1, %k0, %k1
; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: testb $1, %cl
; KNL-NEXT: movl $0, %ecx
; KNL-NEXT: je LBB85_2
; KNL-NEXT: ## %bb.1:
; KNL-NEXT: movzwl 2(%rsi), %ecx
; KNL-NEXT: LBB85_2:
; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: movzwl 2(%rsi), %eax
; KNL-NEXT: xorl %ecx, %ecx
; KNL-NEXT: testb $1, %dil
; KNL-NEXT: cmovel %ecx, %eax
; KNL-NEXT: kmovw %k0, %edi
; KNL-NEXT: testb $1, %dil
; KNL-NEXT: je LBB85_4
; KNL-NEXT: ## %bb.3:
; KNL-NEXT: movzwl (%rsi), %eax
; KNL-NEXT: LBB85_4:
; KNL-NEXT: movw %ax, (%rdx)
; KNL-NEXT: movw %cx, 2(%rdx)
; KNL-NEXT: je LBB85_2
; KNL-NEXT: ## %bb.1:
; KNL-NEXT: movl (%rsi), %ecx
; KNL-NEXT: LBB85_2:
; KNL-NEXT: movw %cx, (%rdx)
; KNL-NEXT: movw %ax, 2(%rdx)
; KNL-NEXT: retq
;
; SKX-LABEL: test_concat_v2i1:
@ -2304,22 +2301,19 @@ define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %
; SKX-NEXT: korw %k1, %k2, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k1
; SKX-NEXT: kmovd %k1, %ecx
; SKX-NEXT: xorl %eax, %eax
; SKX-NEXT: testb $1, %cl
; SKX-NEXT: movl $0, %ecx
; SKX-NEXT: je LBB85_2
; SKX-NEXT: ## %bb.1:
; SKX-NEXT: movzwl 2(%rsi), %ecx
; SKX-NEXT: LBB85_2:
; SKX-NEXT: kmovd %k1, %edi
; SKX-NEXT: movzwl 2(%rsi), %eax
; SKX-NEXT: xorl %ecx, %ecx
; SKX-NEXT: testb $1, %dil
; SKX-NEXT: cmovel %ecx, %eax
; SKX-NEXT: kmovd %k0, %edi
; SKX-NEXT: testb $1, %dil
; SKX-NEXT: je LBB85_4
; SKX-NEXT: ## %bb.3:
; SKX-NEXT: movzwl (%rsi), %eax
; SKX-NEXT: LBB85_4:
; SKX-NEXT: movw %ax, (%rdx)
; SKX-NEXT: movw %cx, 2(%rdx)
; SKX-NEXT: je LBB85_2
; SKX-NEXT: ## %bb.1:
; SKX-NEXT: movl (%rsi), %ecx
; SKX-NEXT: LBB85_2:
; SKX-NEXT: movw %cx, (%rdx)
; SKX-NEXT: movw %ax, 2(%rdx)
; SKX-NEXT: retq
%tmp = load <2 x half>, <2 x half>* %arg, align 8
%tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>


@ -153,206 +153,156 @@ define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i1
declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
; Make sure we scalarize masked loads of f16.
define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr) {
; CHECK-LABEL: test_mask_load_16xf16:
; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: .cfi_def_cfa_offset 40
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: .cfi_offset %rbx, -56
; CHECK-NEXT: .cfi_offset %r12, -48
; CHECK-NEXT: .cfi_offset %r13, -40
; CHECK-NEXT: .cfi_offset %r14, -32
; CHECK-NEXT: .cfi_offset %r15, -24
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovmskb %xmm0, %r11d
; CHECK-NEXT: testb $1, %r11b
; CHECK-NEXT: vpmovmskb %xmm0, %ecx
; CHECK-NEXT: testb $1, %cl
; CHECK-NEXT: je LBB12_1
; CHECK-NEXT: ## %bb.2: ## %cond.load
; CHECK-NEXT: movzwl (%rsi), %ecx
; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm8
; CHECK-NEXT: jmp LBB12_3
; CHECK-NEXT: LBB12_1:
; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: vpxor %xmm8, %xmm8, %xmm8
; CHECK-NEXT: LBB12_3: ## %else
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: testb $2, %r11b
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpxor %xmm9, %xmm9, %xmm9
; CHECK-NEXT: vmovdqa %xmm2, %xmm10
; CHECK-NEXT: vmovdqa %xmm2, %xmm4
; CHECK-NEXT: vmovdqa %xmm2, %xmm5
; CHECK-NEXT: vmovdqa %xmm2, %xmm6
; CHECK-NEXT: vmovdqa %xmm2, %xmm7
; CHECK-NEXT: vmovdqa %xmm2, %xmm1
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: vmovdqa %xmm2, %xmm3
; CHECK-NEXT: vmovdqa %xmm2, %xmm11
; CHECK-NEXT: vmovdqa %xmm2, %xmm12
; CHECK-NEXT: vmovdqa %xmm2, %xmm13
; CHECK-NEXT: vmovdqa %xmm2, %xmm14
; CHECK-NEXT: testb $2, %cl
; CHECK-NEXT: je LBB12_4
; CHECK-NEXT: ## %bb.5: ## %cond.load1
; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: movl %edi, %r12d
; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: movl %edi, %r8d
; CHECK-NEXT: movl %edi, %r9d
; CHECK-NEXT: movl %edi, %r10d
; CHECK-NEXT: movl %edi, %r15d
; CHECK-NEXT: movl %edi, %edx
; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: movzwl 2(%rsi), %edi
; CHECK-NEXT: ## kill: def $di killed $di def $edi
; CHECK-NEXT: testb $4, %r11b
; CHECK-NEXT: vmovdqa %xmm2, %xmm15
; CHECK-NEXT: vpinsrw $0, 2(%rsi), %xmm0, %xmm2
; CHECK-NEXT: testb $4, %cl
; CHECK-NEXT: jne LBB12_7
; CHECK-NEXT: jmp LBB12_8
; CHECK-NEXT: LBB12_4:
; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: movl %edi, %r12d
; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: movl %edi, %r13d
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: movl %edi, %r8d
; CHECK-NEXT: movl %edi, %r9d
; CHECK-NEXT: movl %edi, %r10d
; CHECK-NEXT: movl %edi, %r15d
; CHECK-NEXT: movl %edi, %edx
; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: testb $4, %r11b
; CHECK-NEXT: vmovdqa %xmm2, %xmm15
; CHECK-NEXT: testb $4, %cl
; CHECK-NEXT: je LBB12_8
; CHECK-NEXT: LBB12_7: ## %cond.load4
; CHECK-NEXT: movzwl 4(%rsi), %ecx
; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: vpinsrw $0, 4(%rsi), %xmm0, %xmm10
; CHECK-NEXT: LBB12_8: ## %else5
; CHECK-NEXT: testb $8, %r11b
; CHECK-NEXT: testb $8, %cl
; CHECK-NEXT: jne LBB12_9
; CHECK-NEXT: ## %bb.10: ## %else8
; CHECK-NEXT: testb $16, %r11b
; CHECK-NEXT: testb $16, %cl
; CHECK-NEXT: jne LBB12_11
; CHECK-NEXT: LBB12_12: ## %else11
; CHECK-NEXT: testb $32, %r11b
; CHECK-NEXT: testb $32, %cl
; CHECK-NEXT: jne LBB12_13
; CHECK-NEXT: LBB12_14: ## %else14
; CHECK-NEXT: testb $64, %r11b
; CHECK-NEXT: testb $64, %cl
; CHECK-NEXT: jne LBB12_15
; CHECK-NEXT: LBB12_16: ## %else17
; CHECK-NEXT: testb $-128, %r11b
; CHECK-NEXT: testb $-128, %cl
; CHECK-NEXT: jne LBB12_17
; CHECK-NEXT: LBB12_18: ## %else20
; CHECK-NEXT: testl $256, %r11d ## imm = 0x100
; CHECK-NEXT: testl $256, %ecx ## imm = 0x100
; CHECK-NEXT: jne LBB12_19
; CHECK-NEXT: LBB12_20: ## %else23
; CHECK-NEXT: testl $512, %r11d ## imm = 0x200
; CHECK-NEXT: testl $512, %ecx ## imm = 0x200
; CHECK-NEXT: jne LBB12_21
; CHECK-NEXT: LBB12_22: ## %else26
; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400
; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400
; CHECK-NEXT: jne LBB12_23
; CHECK-NEXT: LBB12_24: ## %else29
; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800
; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800
; CHECK-NEXT: jne LBB12_25
; CHECK-NEXT: LBB12_26: ## %else32
; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000
; CHECK-NEXT: je LBB12_28
; CHECK-NEXT: LBB12_27: ## %cond.load34
; CHECK-NEXT: movzwl 24(%rsi), %edx
; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000
; CHECK-NEXT: jne LBB12_27
; CHECK-NEXT: LBB12_28: ## %else35
; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: testl $8192, %r11d ## imm = 0x2000
; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000
; CHECK-NEXT: jne LBB12_29
; CHECK-NEXT: ## %bb.30: ## %else38
; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000
; CHECK-NEXT: LBB12_30: ## %else38
; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000
; CHECK-NEXT: jne LBB12_31
; CHECK-NEXT: LBB12_32: ## %else41
; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000
; CHECK-NEXT: je LBB12_33
; CHECK-NEXT: LBB12_34: ## %cond.load43
; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
; CHECK-NEXT: movzwl 30(%rsi), %esi
; CHECK-NEXT: jmp LBB12_35
; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000
; CHECK-NEXT: je LBB12_34
; CHECK-NEXT: LBB12_33: ## %cond.load43
; CHECK-NEXT: vpinsrw $0, 30(%rsi), %xmm0, %xmm9
; CHECK-NEXT: LBB12_34: ## %else44
; CHECK-NEXT: vpextrw $0, %xmm8, (%rax)
; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rax)
; CHECK-NEXT: vpextrw $0, %xmm10, 4(%rax)
; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rax)
; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rax)
; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rax)
; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rax)
; CHECK-NEXT: vpextrw $0, %xmm1, 14(%rax)
; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rax)
; CHECK-NEXT: vpextrw $0, %xmm3, 18(%rax)
; CHECK-NEXT: vpextrw $0, %xmm11, 20(%rax)
; CHECK-NEXT: vpextrw $0, %xmm12, 22(%rax)
; CHECK-NEXT: vpextrw $0, %xmm13, 24(%rax)
; CHECK-NEXT: vpextrw $0, %xmm14, 26(%rax)
; CHECK-NEXT: vpextrw $0, %xmm15, 28(%rax)
; CHECK-NEXT: vpextrw $0, %xmm9, 30(%rax)
; CHECK-NEXT: retq
; CHECK-NEXT: LBB12_9: ## %cond.load7
; CHECK-NEXT: movzwl 6(%rsi), %r12d
; CHECK-NEXT: testb $16, %r11b
; CHECK-NEXT: vpinsrw $0, 6(%rsi), %xmm0, %xmm4
; CHECK-NEXT: testb $16, %cl
; CHECK-NEXT: je LBB12_12
; CHECK-NEXT: LBB12_11: ## %cond.load10
; CHECK-NEXT: movzwl 8(%rsi), %ebx
; CHECK-NEXT: testb $32, %r11b
; CHECK-NEXT: vpinsrw $0, 8(%rsi), %xmm0, %xmm5
; CHECK-NEXT: testb $32, %cl
; CHECK-NEXT: je LBB12_14
; CHECK-NEXT: LBB12_13: ## %cond.load13
; CHECK-NEXT: movzwl 10(%rsi), %ebp
; CHECK-NEXT: testb $64, %r11b
; CHECK-NEXT: vpinsrw $0, 10(%rsi), %xmm0, %xmm6
; CHECK-NEXT: testb $64, %cl
; CHECK-NEXT: je LBB12_16
; CHECK-NEXT: LBB12_15: ## %cond.load16
; CHECK-NEXT: movzwl 12(%rsi), %r13d
; CHECK-NEXT: testb $-128, %r11b
; CHECK-NEXT: vpinsrw $0, 12(%rsi), %xmm0, %xmm7
; CHECK-NEXT: testb $-128, %cl
; CHECK-NEXT: je LBB12_18
; CHECK-NEXT: LBB12_17: ## %cond.load19
; CHECK-NEXT: movzwl 14(%rsi), %r14d
; CHECK-NEXT: testl $256, %r11d ## imm = 0x100
; CHECK-NEXT: vpinsrw $0, 14(%rsi), %xmm0, %xmm1
; CHECK-NEXT: testl $256, %ecx ## imm = 0x100
; CHECK-NEXT: je LBB12_20
; CHECK-NEXT: LBB12_19: ## %cond.load22
; CHECK-NEXT: movzwl 16(%rsi), %r8d
; CHECK-NEXT: testl $512, %r11d ## imm = 0x200
; CHECK-NEXT: vpinsrw $0, 16(%rsi), %xmm0, %xmm0
; CHECK-NEXT: testl $512, %ecx ## imm = 0x200
; CHECK-NEXT: je LBB12_22
; CHECK-NEXT: LBB12_21: ## %cond.load25
; CHECK-NEXT: movzwl 18(%rsi), %r9d
; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400
; CHECK-NEXT: vpinsrw $0, 18(%rsi), %xmm0, %xmm3
; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400
; CHECK-NEXT: je LBB12_24
; CHECK-NEXT: LBB12_23: ## %cond.load28
; CHECK-NEXT: movzwl 20(%rsi), %r10d
; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800
; CHECK-NEXT: vpinsrw $0, 20(%rsi), %xmm0, %xmm11
; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800
; CHECK-NEXT: je LBB12_26
; CHECK-NEXT: LBB12_25: ## %cond.load31
; CHECK-NEXT: movzwl 22(%rsi), %r15d
; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000
; CHECK-NEXT: jne LBB12_27
; CHECK-NEXT: jmp LBB12_28
; CHECK-NEXT: vpinsrw $0, 22(%rsi), %xmm0, %xmm12
; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000
; CHECK-NEXT: je LBB12_28
; CHECK-NEXT: LBB12_27: ## %cond.load34
; CHECK-NEXT: vpinsrw $0, 24(%rsi), %xmm0, %xmm13
; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000
; CHECK-NEXT: je LBB12_30
; CHECK-NEXT: LBB12_29: ## %cond.load37
; CHECK-NEXT: movzwl 26(%rsi), %ecx
; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000
; CHECK-NEXT: vpinsrw $0, 26(%rsi), %xmm0, %xmm14
; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000
; CHECK-NEXT: je LBB12_32
; CHECK-NEXT: LBB12_31: ## %cond.load40
; CHECK-NEXT: movzwl 28(%rsi), %ecx
; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000
; CHECK-NEXT: jne LBB12_34
; CHECK-NEXT: LBB12_33:
; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi ## 4-byte Reload
; CHECK-NEXT: LBB12_35: ## %else44
; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload
; CHECK-NEXT: movw %dx, (%rax)
; CHECK-NEXT: movw %di, 2(%rax)
; CHECK-NEXT: movw %cx, 4(%rax)
; CHECK-NEXT: movw %r12w, 6(%rax)
; CHECK-NEXT: movw %bx, 8(%rax)
; CHECK-NEXT: movw %bp, 10(%rax)
; CHECK-NEXT: movw %r13w, 12(%rax)
; CHECK-NEXT: movw %r14w, 14(%rax)
; CHECK-NEXT: movw %r8w, 16(%rax)
; CHECK-NEXT: movw %r9w, 18(%rax)
; CHECK-NEXT: movw %r10w, 20(%rax)
; CHECK-NEXT: movw %r15w, 22(%rax)
; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
; CHECK-NEXT: movw %cx, 24(%rax)
; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
; CHECK-NEXT: movw %cx, 26(%rax)
; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload
; CHECK-NEXT: movw %cx, 28(%rax)
; CHECK-NEXT: movw %si, 30(%rax)
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
; CHECK-NEXT: vpinsrw $0, 28(%rsi), %xmm0, %xmm15
; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000
; CHECK-NEXT: jne LBB12_33
; CHECK-NEXT: jmp LBB12_34
%res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer)
ret <16 x half> %res
}
@ -414,78 +364,76 @@ define void @test_mask_store_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x h
; CHECK-NEXT: LBB13_32: ## %else30
; CHECK-NEXT: retq
; CHECK-NEXT: LBB13_1: ## %cond.store
; CHECK-NEXT: movw %si, (%rdi)
; CHECK-NEXT: vpextrw $0, %xmm1, (%rdi)
; CHECK-NEXT: testb $2, %al
; CHECK-NEXT: je LBB13_4
; CHECK-NEXT: LBB13_3: ## %cond.store1
; CHECK-NEXT: movw %dx, 2(%rdi)
; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rdi)
; CHECK-NEXT: testb $4, %al
; CHECK-NEXT: je LBB13_6
; CHECK-NEXT: LBB13_5: ## %cond.store3
; CHECK-NEXT: movw %cx, 4(%rdi)
; CHECK-NEXT: vpextrw $0, %xmm3, 4(%rdi)
; CHECK-NEXT: testb $8, %al
; CHECK-NEXT: je LBB13_8
; CHECK-NEXT: LBB13_7: ## %cond.store5
; CHECK-NEXT: movw %r8w, 6(%rdi)
; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rdi)
; CHECK-NEXT: testb $16, %al
; CHECK-NEXT: je LBB13_10
; CHECK-NEXT: LBB13_9: ## %cond.store7
; CHECK-NEXT: movw %r9w, 8(%rdi)
; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rdi)
; CHECK-NEXT: testb $32, %al
; CHECK-NEXT: je LBB13_12
; CHECK-NEXT: LBB13_11: ## %cond.store9
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 10(%rdi)
; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rdi)
; CHECK-NEXT: testb $64, %al
; CHECK-NEXT: je LBB13_14
; CHECK-NEXT: LBB13_13: ## %cond.store11
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 12(%rdi)
; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rdi)
; CHECK-NEXT: testb $-128, %al
; CHECK-NEXT: je LBB13_16
; CHECK-NEXT: LBB13_15: ## %cond.store13
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 14(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 14(%rdi)
; CHECK-NEXT: testl $256, %eax ## imm = 0x100
; CHECK-NEXT: je LBB13_18
; CHECK-NEXT: LBB13_17: ## %cond.store15
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 16(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rdi)
; CHECK-NEXT: testl $512, %eax ## imm = 0x200
; CHECK-NEXT: je LBB13_20
; CHECK-NEXT: LBB13_19: ## %cond.store17
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 18(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 18(%rdi)
; CHECK-NEXT: testl $1024, %eax ## imm = 0x400
; CHECK-NEXT: je LBB13_22
; CHECK-NEXT: LBB13_21: ## %cond.store19
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 20(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 20(%rdi)
; CHECK-NEXT: testl $2048, %eax ## imm = 0x800
; CHECK-NEXT: je LBB13_24
; CHECK-NEXT: LBB13_23: ## %cond.store21
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 22(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 22(%rdi)
; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000
; CHECK-NEXT: je LBB13_26
; CHECK-NEXT: LBB13_25: ## %cond.store23
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 24(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 24(%rdi)
; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000
; CHECK-NEXT: je LBB13_28
; CHECK-NEXT: LBB13_27: ## %cond.store25
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 26(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 26(%rdi)
; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000
; CHECK-NEXT: je LBB13_30
; CHECK-NEXT: LBB13_29: ## %cond.store27
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movw %cx, 28(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 28(%rdi)
; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000
; CHECK-NEXT: je LBB13_32
; CHECK-NEXT: LBB13_31: ## %cond.store29
; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: movw %ax, 30(%rdi)
; CHECK-NEXT: vpinsrw $0, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, 30(%rdi)
; CHECK-NEXT: retq
call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask)
ret void


@ -211,8 +211,8 @@ define half @movmsk(half %x) {
define half @bitcast_fabs(half %x) {
; CHECK-LABEL: bitcast_fabs:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast half %x to i16
%and = and i16 %bc1, 32767
@ -223,8 +223,8 @@ define half @bitcast_fabs(half %x) {
define half @bitcast_fneg(half %x) {
; CHECK-LABEL: bitcast_fneg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast half %x to i16
%xor = xor i16 %bc1, 32768
@ -285,8 +285,8 @@ define half @fsub_bitcast_fneg(half %x, half %y) {
define half @nabs(half %a) {
; CHECK-LABEL: nabs:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%conv = bitcast half %a to i16
%and = or i16 %conv, -32768


@ -17,7 +17,7 @@
; CHECK-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %2
; CHECK-NEXT: t8: i32 = add t2, Constant:i32<4>
; CHECK-NEXT: t22: ch,glue = CopyToReg t17, Register:i32 %5, t8
; CHECK-NEXT: t30: ch,glue = inlineasm_br t22, TargetExternalSymbol:i64'xorl $0, $0; jmp ${1:l}', MDNode:ch<null>, TargetConstant:i64<8>, TargetConstant:i32<2293769>, Register:i32 %5, TargetConstant:i64<13>, TargetBlockAddress:i64<@test, %fail> 0, TargetConstant:i32<12>, Register:i32 $df, TargetConstant:i32<12>, Register:i16 $fpsw, TargetConstant:i32<12>, Register:i32 $eflags, t22:1
; CHECK-NEXT: t30: ch,glue = inlineasm_br t22, TargetExternalSymbol:i64'xorl $0, $0; jmp ${1:l}', MDNode:ch<null>, TargetConstant:i64<8>, TargetConstant:i32<2359305>, Register:i32 %5, TargetConstant:i64<13>, TargetBlockAddress:i64<@test, %fail> 0, TargetConstant:i32<12>, Register:i32 $df, TargetConstant:i32<12>, Register:i16 $fpsw, TargetConstant:i32<12>, Register:i32 $eflags, t22:1
define i32 @test(i32 %a, i32 %b, i32 %c) {
entry:


@ -9,7 +9,8 @@ define void @test1(float %src, i16* %dest) {
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: .cfi_offset %rbx, -16
; LIBCALL-NEXT: movq %rdi, %rbx
; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT
; LIBCALL-NEXT: callq __truncsfhf2@PLT
; LIBCALL-NEXT: pextrw $0, %xmm0, %eax
; LIBCALL-NEXT: movw %ax, (%rbx)
; LIBCALL-NEXT: popq %rbx
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
@ -28,8 +29,8 @@ define void @test1(float %src, i16* %dest) {
define float @test2(i16* nocapture %src) {
; LIBCALL-LABEL: test2:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: movzwl (%rdi), %edi
; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL
; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0
; LIBCALL-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; FP16-LABEL: test2:
; FP16: # %bb.0:
@ -46,11 +47,10 @@ define float @test3(float %src) nounwind uwtable readnone {
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT
; LIBCALL-NEXT: movzwl %ax, %edi
; LIBCALL-NEXT: callq __truncsfhf2@PLT
; LIBCALL-NEXT: popq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL
; LIBCALL-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; FP16-LABEL: test3:
; FP16: # %bb.0:
@ -66,14 +66,8 @@ define float @test3(float %src) nounwind uwtable readnone {
define double @test4(i16* nocapture %src) {
; LIBCALL-LABEL: test4:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: movzwl (%rdi), %edi
; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT
; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
; LIBCALL-NEXT: popq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0
; LIBCALL-NEXT: jmp __extendhfdf2@PLT # TAILCALL
;
; FP16-LABEL: test4:
; FP16: # %bb.0:
@ -88,7 +82,14 @@ define double @test4(i16* nocapture %src) {
define i16 @test5(double %src) {
; LIBCALL-LABEL: test5:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: callq __truncdfhf2@PLT
; LIBCALL-NEXT: pextrw $0, %xmm0, %eax
; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax
; LIBCALL-NEXT: popq %rcx
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
;
; FP16-LABEL: test5:
; FP16: # %bb.0:
@ -106,10 +107,8 @@ define x86_fp80 @test6(i16* nocapture %src) {
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: movzwl (%rdi), %edi
; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT
; LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp)
; LIBCALL-NEXT: flds {{[0-9]+}}(%rsp)
; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0
; LIBCALL-NEXT: callq __extendhfxf2@PLT
; LIBCALL-NEXT: popq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
@ -131,7 +130,16 @@ define x86_fp80 @test6(i16* nocapture %src) {
define i16 @test7(x86_fp80 %src) {
; LIBCALL-LABEL: test7:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: jmp __truncxfhf2@PLT # TAILCALL
; LIBCALL-NEXT: subq $24, %rsp
; LIBCALL-NEXT: .cfi_def_cfa_offset 32
; LIBCALL-NEXT: fldt {{[0-9]+}}(%rsp)
; LIBCALL-NEXT: fstpt (%rsp)
; LIBCALL-NEXT: callq __truncxfhf2@PLT
; LIBCALL-NEXT: pextrw $0, %xmm0, %eax
; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax
; LIBCALL-NEXT: addq $24, %rsp
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
;
; FP16-LABEL: test7:
; FP16: # %bb.0:


@ -28,8 +28,8 @@ define void @test1(float %src, i16* %dest) {
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: .cfi_offset %rbx, -16
; LIBCALL-NEXT: movq %rdi, %rbx
; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT
; LIBCALL-NEXT: movw %ax, (%rbx)
; LIBCALL-NEXT: callq __truncsfhf2@PLT
; LIBCALL-NEXT: pextrw $0, %xmm0, (%rbx)
; LIBCALL-NEXT: popq %rbx
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
@ -37,7 +37,8 @@ define void @test1(float %src, i16* %dest) {
; F16C-LABEL: test1:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: movw %ax, (%rdi)
; F16C-NEXT: retq
;
; SOFTFLOAT-LABEL: test1:
@ -59,8 +60,8 @@ define void @test1(float %src, i16* %dest) {
define float @test2(i16* nocapture %src) {
; LIBCALL-LABEL: test2:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: movzwl (%rdi), %edi
; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL
; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0
; LIBCALL-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: test2:
; F16C: # %bb.0:
@ -88,15 +89,17 @@ define float @test3(float %src) nounwind uwtable readnone {
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT
; LIBCALL-NEXT: movzwl %ax, %edi
; LIBCALL-NEXT: callq __truncsfhf2@PLT
; LIBCALL-NEXT: popq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL
; LIBCALL-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; F16C-LABEL: test3:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
@ -118,14 +121,8 @@ define float @test3(float %src) nounwind uwtable readnone {
define double @test4(i16* nocapture %src) {
; LIBCALL-LABEL: test4:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: movzwl (%rdi), %edi
; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT
; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
; LIBCALL-NEXT: popq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0
; LIBCALL-NEXT: jmp __extendhfdf2@PLT # TAILCALL
;
; F16C-LABEL: test4:
; F16C: # %bb.0:
@ -154,11 +151,22 @@ define double @test4(i16* nocapture %src) {
define i16 @test5(double %src) {
; LIBCALL-LABEL: test5:
; LIBCALL: # %bb.0:
; LIBCALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL
; LIBCALL-NEXT: pushq %rax
; LIBCALL-NEXT: .cfi_def_cfa_offset 16
; LIBCALL-NEXT: callq __truncdfhf2@PLT
; LIBCALL-NEXT: pextrw $0, %xmm0, %eax
; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax
; LIBCALL-NEXT: popq %rcx
; LIBCALL-NEXT: .cfi_def_cfa_offset 8
; LIBCALL-NEXT: retq
;
; F16C-LABEL: test5:
; F16C: # %bb.0:
; F16C-NEXT: jmp __truncdfhf2@PLT # TAILCALL
; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: retq
;
; SOFTFLOAT-LABEL: test5:
; SOFTFLOAT: # %bb.0:


@ -16,6 +16,8 @@ define zeroext i16 @test1_fast(double %d) #0 {
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: popq %rcx
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
@ -25,40 +27,42 @@ entry:
}
define zeroext i16 @test2_fast(x86_fp80 %d) #0 {
; F16C-LABEL: test2_fast:
; F16C: # %bb.0: # %entry
; F16C-NEXT: fldt {{[0-9]+}}(%rsp)
; F16C-NEXT: fstps -{{[0-9]+}}(%rsp)
; F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: retq
;
; AVX-LABEL: test2_fast:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 32
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fstpt (%rsp)
; AVX-NEXT: callq __truncxfhf2@PLT
; AVX-NEXT: addq $24, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
; ALL-LABEL: test2_fast:
; ALL: # %bb.0: # %entry
; ALL-NEXT: subq $24, %rsp
; ALL-NEXT: .cfi_def_cfa_offset 32
; ALL-NEXT: fldt {{[0-9]+}}(%rsp)
; ALL-NEXT: fstpt (%rsp)
; ALL-NEXT: callq __truncxfhf2@PLT
; ALL-NEXT: vpextrw $0, %xmm0, %eax
; ALL-NEXT: # kill: def $ax killed $ax killed $eax
; ALL-NEXT: addq $24, %rsp
; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
entry:
%0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d)
ret i16 %0
}
define zeroext i16 @test1(double %d) #1 {
; ALL-LABEL: test1:
; ALL: # %bb.0: # %entry
; ALL-NEXT: pushq %rax
; ALL-NEXT: .cfi_def_cfa_offset 16
; ALL-NEXT: callq __truncdfhf2@PLT
; ALL-NEXT: popq %rcx
; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
; F16C-LABEL: test1:
; F16C: # %bb.0: # %entry
; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %rax
; AVX-NEXT: .cfi_def_cfa_offset 16
; AVX-NEXT: callq __truncdfhf2@PLT
; AVX-NEXT: vpextrw $0, %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: popq %rcx
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
%0 = tail call i16 @llvm.convert.to.fp16.f64(double %d)
ret i16 %0
@ -72,6 +76,8 @@ define zeroext i16 @test2(x86_fp80 %d) #1 {
; ALL-NEXT: fldt {{[0-9]+}}(%rsp)
; ALL-NEXT: fstpt (%rsp)
; ALL-NEXT: callq __truncxfhf2@PLT
; ALL-NEXT: vpextrw $0, %xmm0, %eax
; ALL-NEXT: # kill: def $ax killed $ax killed $eax
; ALL-NEXT: addq $24, %rsp
; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq


@ -111,14 +111,12 @@ define dso_local float @div_arcp_by_const(half %x) {
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: callq __gnu_f2h_ieee@PLT
; X64-NEXT: movzwl %ax, %edi
; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: popq %rax
; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL
; X64-NEXT: jmp __extendhfsf2@PLT # TAILCALL
;
; X86-LABEL: div_arcp_by_const:
; X86: # %bb.0:


@ -10,18 +10,16 @@ define half @round_f16(half %h) {
; SSE2: ## %bb.0: ## %entry
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: movzwl %di, %edi
; SSE2-NEXT: callq ___extendhfsf2
; SSE2-NEXT: callq _roundf
; SSE2-NEXT: callq ___truncsfhf2
; SSE2-NEXT: popq %rcx
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: round_f16:
; SSE41: ## %bb.0: ## %entry
; SSE41-NEXT: pushq %rax
; SSE41-NEXT: .cfi_def_cfa_offset 16
; SSE41-NEXT: movzwl %di, %edi
; SSE41-NEXT: callq ___extendhfsf2
; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE41-NEXT: andps %xmm0, %xmm1
@ -30,14 +28,13 @@ define half @round_f16(half %h) {
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: roundss $11, %xmm1, %xmm0
; SSE41-NEXT: callq ___truncsfhf2
; SSE41-NEXT: popq %rcx
; SSE41-NEXT: popq %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: round_f16:
; AVX1: ## %bb.0: ## %entry
; AVX1-NEXT: pushq %rax
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: movzwl %di, %edi
; AVX1-NEXT: callq ___extendhfsf2
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
@ -45,12 +42,13 @@ define half @round_f16(half %h) {
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: callq ___truncsfhf2
; AVX1-NEXT: popq %rcx
; AVX1-NEXT: popq %rax
; AVX1-NEXT: retq
;
; AVX512F-LABEL: round_f16:
; AVX512F: ## %bb.0: ## %entry
; AVX512F-NEXT: movzwl %di, %eax
; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
@ -59,7 +57,7 @@ define half @round_f16(half %h) {
; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: round_f16:


@ -10,44 +10,42 @@ define half @roundeven_f16(half %h) {
; SSE2: ## %bb.0: ## %entry
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: movzwl %di, %edi
; SSE2-NEXT: callq ___extendhfsf2
; SSE2-NEXT: callq _roundevenf
; SSE2-NEXT: callq ___truncsfhf2
; SSE2-NEXT: popq %rcx
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: roundeven_f16:
; SSE41: ## %bb.0: ## %entry
; SSE41-NEXT: pushq %rax
; SSE41-NEXT: .cfi_def_cfa_offset 16
; SSE41-NEXT: movzwl %di, %edi
; SSE41-NEXT: callq ___extendhfsf2
; SSE41-NEXT: roundss $8, %xmm0, %xmm0
; SSE41-NEXT: callq ___truncsfhf2
; SSE41-NEXT: popq %rcx
; SSE41-NEXT: popq %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: roundeven_f16:
; AVX1: ## %bb.0: ## %entry
; AVX1-NEXT: pushq %rax
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: movzwl %di, %edi
; AVX1-NEXT: callq ___extendhfsf2
; AVX1-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX1-NEXT: callq ___truncsfhf2
; AVX1-NEXT: popq %rcx
; AVX1-NEXT: popq %rax
; AVX1-NEXT: retq
;
; AVX512F-LABEL: roundeven_f16:
; AVX512F: ## %bb.0: ## %entry
; AVX512F-NEXT: movzwl %di, %eax
; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax
; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512FP16-LABEL: roundeven_f16:


@ -19,10 +19,9 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp {
; X64-SSE-LABEL: TestFPExtF16_F128:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: pushq %rax
; X64-SSE-NEXT: movzwl vf16(%rip), %edi
; X64-SSE-NEXT: callq __gnu_h2f_ieee@PLT
; X64-SSE-NEXT: callq __extendsftf2@PLT
; X64-SSE-NEXT: movaps %xmm0, vf128(%rip)
; X64-SSE-NEXT: pinsrw $0, vf16(%rip), %xmm0
; X64-SSE-NEXT: callq __extendhftf2@PLT
; X64-SSE-NEXT: movdqa %xmm0, vf128(%rip)
; X64-SSE-NEXT: popq %rax
; X64-SSE-NEXT: retq
;
@ -218,8 +217,9 @@ define dso_local void @TestFPTruncF128_F16() nounwind strictfp {
; X64-SSE-LABEL: TestFPTruncF128_F16:
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: pushq %rax
; X64-SSE-NEXT: movaps vf128(%rip), %xmm0
; X64-SSE-NEXT: movdqa vf128(%rip), %xmm0
; X64-SSE-NEXT: callq __trunctfhf2@PLT
; X64-SSE-NEXT: pextrw $0, %xmm0, %eax
; X64-SSE-NEXT: movw %ax, vf16(%rip)
; X64-SSE-NEXT: popq %rax
; X64-SSE-NEXT: retq


@ -139,15 +139,17 @@ define i32 @stest_f16i32(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
; CHECK-NEXT: cmovbel %eax, %ecx
; CHECK-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; CHECK-NEXT: cmovael %eax, %ecx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
; CHECK-NEXT: cmovbel %ecx, %edx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovnpl %ecx, %eax
; CHECK-NEXT: cmovnpl %edx, %eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@ -166,8 +168,7 @@ define i32 @utesth_f16i32(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: sarq $63, %rcx
@ -195,8 +196,7 @@ define i32 @ustest_f16i32(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rcx
; CHECK-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
; CHECK-NEXT: cmpq %rax, %rcx
@ -343,13 +343,17 @@ define i16 @stest_f16i16(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: maxss %xmm0, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: minss %xmm1, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: cmovael %eax, %ecx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $32767, %edx # imm = 0x7FFF
; CHECK-NEXT: cmovbel %ecx, %edx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovnpl %edx, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
@ -369,8 +373,7 @@ define i16 @utesth_f16i16(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rcx
; CHECK-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
@ -392,8 +395,7 @@ define i16 @ustest_f16i16(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; CHECK-NEXT: movl $65535, %ecx # imm = 0xFFFF
@ -562,15 +564,17 @@ define i64 @stest_f16i64(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovbeq %rax, %rcx
; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; CHECK-NEXT: cmovaeq %rax, %rcx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovbeq %rcx, %rdx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovnpq %rcx, %rax
; CHECK-NEXT: cmovnpq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@ -589,9 +593,7 @@ define i64 @utesth_f16i64(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __fixunssfti@PLT
; CHECK-NEXT: callq __fixunshfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: cmovneq %rcx, %rax
@ -611,9 +613,7 @@ define i64 @ustest_f16i64(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __fixsfti@PLT
; CHECK-NEXT: callq __fixhfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: movl $1, %esi
@ -768,15 +768,17 @@ define i32 @stest_f16i32_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
; CHECK-NEXT: cmovbel %eax, %ecx
; CHECK-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; CHECK-NEXT: cmovael %eax, %ecx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
; CHECK-NEXT: cmovbel %ecx, %edx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovnpl %ecx, %eax
; CHECK-NEXT: cmovnpl %edx, %eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@ -793,8 +795,7 @@ define i32 @utesth_f16i32_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rcx
; CHECK-NEXT: movq %rcx, %rdx
; CHECK-NEXT: sarq $63, %rdx
@ -821,8 +822,7 @@ define i32 @ustest_f16i32_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
; CHECK-NEXT: cmpq %rcx, %rax
@ -957,13 +957,17 @@ define i16 @stest_f16i16_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: maxss %xmm0, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: minss %xmm1, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: cmovael %eax, %ecx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movl $32767, %edx # imm = 0x7FFF
; CHECK-NEXT: cmovbel %ecx, %edx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovnpl %edx, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
@ -981,8 +985,7 @@ define i16 @utesth_f16i16_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rcx
; CHECK-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
@ -1003,8 +1006,7 @@ define i16 @ustest_f16i16_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; CHECK-NEXT: movl $65535, %ecx # imm = 0xFFFF
@ -1163,15 +1165,17 @@ define i64 @stest_f16i64_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovbeq %rax, %rcx
; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; CHECK-NEXT: cmovaeq %rax, %rcx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovbeq %rcx, %rdx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovnpq %rcx, %rax
; CHECK-NEXT: cmovnpq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@ -1188,9 +1192,7 @@ define i64 @utesth_f16i64_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __fixunssfti@PLT
; CHECK-NEXT: callq __fixunshfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: cmovneq %rcx, %rax
@ -1211,9 +1213,7 @@ define i64 @ustest_f16i64_mm(half %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
; CHECK-NEXT: callq __fixsfti@PLT
; CHECK-NEXT: callq __fixhfti@PLT
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: movl $1, %esi

File diff suppressed because it is too large


@ -2052,6 +2052,7 @@ declare i64 @llvm.fptosi.sat.i64.f16 (half)
declare i100 @llvm.fptosi.sat.i100.f16(half)
declare i128 @llvm.fptosi.sat.i128.f16(half)
; FIXME: Can be optimized with maxss + minss
define i1 @test_signed_i1_f16(half %f) nounwind {
; X86-X87-LABEL: test_signed_i1_f16:
; X86-X87: # %bb.0:
@ -2109,15 +2110,22 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
; X86-SSE-LABEL: test_signed_i1_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: cvttss2si %xmm0, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $255, %eax
; X86-SSE-NEXT: cmovael %ecx, %eax
; X86-SSE-NEXT: xorl %ecx, %ecx
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
; X86-SSE-NEXT: cmoval %ecx, %eax
; X86-SSE-NEXT: ucomiss %xmm0, %xmm0
; X86-SSE-NEXT: cmovpl %ecx, %eax
; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2125,13 +2133,17 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
; X64-LABEL: test_signed_i1_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $255, %eax
; X64-NEXT: cmovael %ecx, %eax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmoval %ecx, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: cmovpl %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2139,6 +2151,7 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
ret i1 %x
}
; FIXME: Can be optimized with maxss + minss
define i8 @test_signed_i8_f16(half %f) nounwind {
; X86-X87-LABEL: test_signed_i8_f16:
; X86-X87: # %bb.0:
@ -2192,15 +2205,22 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
; X86-SSE-LABEL: test_signed_i8_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $128, %ecx
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $127, %edx
; X86-SSE-NEXT: cmovbel %ecx, %edx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: ucomiss %xmm0, %xmm0
; X86-SSE-NEXT: cmovnpl %edx, %eax
; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2208,13 +2228,17 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
; X64-LABEL: test_signed_i8_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $128, %ecx
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $127, %edx
; X64-NEXT: cmovbel %ecx, %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: cmovnpl %edx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2222,6 +2246,7 @@ define i8 @test_signed_i8_f16(half %f) nounwind {
ret i8 %x
}
; FIXME: Can be optimized with maxss + minss
define i13 @test_signed_i13_f16(half %f) nounwind {
; X86-X87-LABEL: test_signed_i13_f16:
; X86-X87: # %bb.0:
@ -2276,15 +2301,22 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
; X86-SSE-LABEL: test_signed_i13_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $61440, %ecx # imm = 0xF000
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $4095, %edx # imm = 0xFFF
; X86-SSE-NEXT: cmovbel %ecx, %edx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: ucomiss %xmm0, %xmm0
; X86-SSE-NEXT: cmovnpl %edx, %eax
; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2292,13 +2324,17 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
; X64-LABEL: test_signed_i13_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $61440, %ecx # imm = 0xF000
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $4095, %edx # imm = 0xFFF
; X64-NEXT: cmovbel %ecx, %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: cmovnpl %edx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2306,6 +2342,7 @@ define i13 @test_signed_i13_f16(half %f) nounwind {
ret i13 %x
}
; FIXME: Can be optimized with maxss + minss
define i16 @test_signed_i16_f16(half %f) nounwind {
; X86-X87-LABEL: test_signed_i16_f16:
; X86-X87: # %bb.0:
@ -2360,15 +2397,22 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
; X86-SSE-LABEL: test_signed_i16_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $32768, %ecx # imm = 0x8000
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $32767, %edx # imm = 0x7FFF
; X86-SSE-NEXT: cmovbel %ecx, %edx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: ucomiss %xmm0, %xmm0
; X86-SSE-NEXT: cmovnpl %edx, %eax
; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2376,13 +2420,17 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
; X64-LABEL: test_signed_i16_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $32768, %ecx # imm = 0x8000
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $32767, %edx # imm = 0x7FFF
; X64-NEXT: cmovbel %ecx, %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: cmovnpl %edx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2390,6 +2438,7 @@ define i16 @test_signed_i16_f16(half %f) nounwind {
ret i16 %x
}
; FIXME: Can be optimized with maxss + minss
define i19 @test_signed_i19_f16(half %f) nounwind {
; X86-X87-LABEL: test_signed_i19_f16:
; X86-X87: # %bb.0:
@ -2444,31 +2493,39 @@ define i19 @test_signed_i19_f16(half %f) nounwind {
; X86-SSE-LABEL: test_signed_i19_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $-262144, %ecx # imm = 0xFFFC0000
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $262143, %edx # imm = 0x3FFFF
; X86-SSE-NEXT: cmovbel %ecx, %edx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: ucomiss %xmm0, %xmm0
; X86-SSE-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: cvttss2si %xmm0, %ecx
; X86-SSE-NEXT: cmovnpl %ecx, %eax
; X86-SSE-NEXT: cmovnpl %edx, %eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
;
; X64-LABEL: test_signed_i19_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $-262144, %ecx # imm = 0xFFFC0000
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $262143, %edx # imm = 0x3FFFF
; X64-NEXT: cmovbel %ecx, %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: cvttss2si %xmm0, %ecx
; X64-NEXT: cmovnpl %ecx, %eax
; X64-NEXT: cmovnpl %edx, %eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%x = call i19 @llvm.fptosi.sat.i19.f16(half %f)
@ -2529,33 +2586,39 @@ define i32 @test_signed_i32_f16(half %f) nounwind {
; X86-SSE-LABEL: test_signed_i32_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
; X86-SSE-NEXT: cmovbel %eax, %ecx
; X86-SSE-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
; X86-SSE-NEXT: cmovbel %ecx, %edx
; X86-SSE-NEXT: xorl %eax, %eax
; X86-SSE-NEXT: ucomiss %xmm0, %xmm0
; X86-SSE-NEXT: cmovnpl %ecx, %eax
; X86-SSE-NEXT: cmovnpl %edx, %eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
;
; X64-LABEL: test_signed_i32_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
; X64-NEXT: cmovbel %eax, %ecx
; X64-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
; X64-NEXT: cmovbel %ecx, %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: cmovnpl %ecx, %eax
; X64-NEXT: cmovnpl %edx, %eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%x = call i32 @llvm.fptosi.sat.i32.f16(half %f)
@ -2634,9 +2697,10 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $24, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@ -2669,8 +2733,7 @@ define i50 @test_signed_i50_f16(half %f) nounwind {
; X64-LABEL: test_signed_i50_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movabsq $-562949953421312, %rcx # imm = 0xFFFE000000000000
@ -2759,9 +2822,10 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $24, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@ -2794,15 +2858,17 @@ define i64 @test_signed_i64_f16(half %f) nounwind {
; X64-LABEL: test_signed_i64_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: cmovbeq %rax, %rcx
; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; X64-NEXT: cmovaeq %rax, %rcx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: cmovbeq %rcx, %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm0
; X64-NEXT: cmovnpq %rcx, %rax
; X64-NEXT: cmovnpq %rdx, %rax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%x = call i64 @llvm.fptosi.sat.i64.f16(half %f)
@ -2908,10 +2974,11 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
; X86-SSE-NEXT: pushl %edi
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $44, %esp
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
@ -2963,8 +3030,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
; X64-LABEL: test_signed_i100_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: callq __fixsfti@PLT
; X64-NEXT: xorl %ecx, %ecx
@ -3093,10 +3159,11 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
; X86-SSE-NEXT: pushl %edi
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $44, %esp
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
@ -3144,8 +3211,7 @@ define i128 @test_signed_i128_f16(half %f) nounwind {
; X64-LABEL: test_signed_i128_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: callq __fixsfti@PLT
; X64-NEXT: xorl %ecx, %ecx

File diff suppressed because it is too large

@ -1922,15 +1922,20 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i1_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: xorps %xmm0, %xmm0
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: xorl %ecx, %ecx
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $1, %eax
; X86-SSE-NEXT: cmovbel %ecx, %eax
; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -1938,13 +1943,15 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i1_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovbel %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -1997,15 +2004,20 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i8_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: xorps %xmm0, %xmm0
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: xorl %ecx, %ecx
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $255, %eax
; X86-SSE-NEXT: cmovbel %ecx, %eax
; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2013,13 +2025,15 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i8_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $255, %eax
; X64-NEXT: cmovbel %ecx, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2071,15 +2085,20 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i13_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: xorps %xmm0, %xmm0
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: xorl %ecx, %ecx
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $8191, %eax # imm = 0x1FFF
; X86-SSE-NEXT: cmovbel %ecx, %eax
; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2087,13 +2106,15 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i13_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $8191, %eax # imm = 0x1FFF
; X64-NEXT: cmovbel %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2145,15 +2166,20 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i16_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: xorps %xmm0, %xmm0
; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: minss %xmm0, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %eax
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: xorl %ecx, %ecx
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
; X86-SSE-NEXT: cmovael %eax, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $65535, %eax # imm = 0xFFFF
; X86-SSE-NEXT: cmovbel %ecx, %eax
; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
@ -2161,13 +2187,15 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i16_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $65535, %eax # imm = 0xFFFF
; X64-NEXT: cmovbel %ecx, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
@ -2219,27 +2247,42 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i19_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: maxss %xmm1, %xmm0
; X86-SSE-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
; X86-SSE-NEXT: movl %eax, %ecx
; X86-SSE-NEXT: sarl $31, %ecx
; X86-SSE-NEXT: movaps %xmm0, %xmm1
; X86-SSE-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT: cvttss2si %xmm1, %edx
; X86-SSE-NEXT: andl %ecx, %edx
; X86-SSE-NEXT: orl %eax, %edx
; X86-SSE-NEXT: xorl %ecx, %ecx
; X86-SSE-NEXT: xorps %xmm1, %xmm1
; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
; X86-SSE-NEXT: cmovael %edx, %ecx
; X86-SSE-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: movl $524287, %eax # imm = 0x7FFFF
; X86-SSE-NEXT: cmovbel %ecx, %eax
; X86-SSE-NEXT: addl $12, %esp
; X86-SSE-NEXT: retl
;
; X64-LABEL: test_unsigned_i19_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: minss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmovael %eax, %ecx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movl $524287, %eax # imm = 0x7FFFF
; X64-NEXT: cmovbel %ecx, %eax
; X64-NEXT: popq %rcx
; X64-NEXT: retq
%x = call i19 @llvm.fptoui.sat.i19.f16(half %f)
@ -2290,9 +2333,10 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i32_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $12, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: cvttss2si %xmm0, %eax
@ -2316,8 +2360,7 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i32_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
@ -2406,9 +2449,10 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $24, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@ -2452,13 +2496,19 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i50_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: sarq $63, %rcx
; X64-NEXT: movaps %xmm0, %xmm1
; X64-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: cvttss2si %xmm1, %rdx
; X64-NEXT: andq %rcx, %rdx
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: cmovaeq %rax, %rcx
; X64-NEXT: cmovaeq %rdx, %rcx
; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movabsq $1125899906842623, %rax # imm = 0x3FFFFFFFFFFFF
; X64-NEXT: cmovbeq %rcx, %rax
@ -2540,9 +2590,10 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
; X86-SSE-LABEL: test_unsigned_i64_f16:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: subl $28, %esp
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@ -2584,8 +2635,7 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i64_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: sarq $63, %rcx
@ -2689,10 +2739,11 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
; X86-SSE-NEXT: pushl %edi
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $32, %esp
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
@ -2739,8 +2790,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i100_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: callq __fixunssfti@PLT
; X64-NEXT: xorl %ecx, %ecx
@ -2840,10 +2890,11 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
; X86-SSE-NEXT: pushl %edi
; X86-SSE-NEXT: pushl %esi
; X86-SSE-NEXT: subl $32, %esp
; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: calll __gnu_h2f_ieee
; X86-SSE-NEXT: pextrw $0, %xmm0, %eax
; X86-SSE-NEXT: movw %ax, (%esp)
; X86-SSE-NEXT: calll __extendhfsf2
; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movl %eax, (%esp)
; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp)
@ -2888,8 +2939,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
; X64-LABEL: test_unsigned_i128_f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movzwl %di, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: callq __fixunssfti@PLT
; X64-NEXT: xorl %ecx, %ecx

File diff suppressed because it is too large

@ -38,14 +38,10 @@ define half @freeze_half() {
; X86ASM: # %bb.0:
; X86ASM-NEXT: pushq %rax
; X86ASM-NEXT: .cfi_def_cfa_offset 16
; X86ASM-NEXT: xorl %edi, %edi
; X86ASM-NEXT: callq __gnu_h2f_ieee@PLT
; X86ASM-NEXT: callq __gnu_f2h_ieee@PLT
; X86ASM-NEXT: movzwl %ax, %edi
; X86ASM-NEXT: callq __gnu_h2f_ieee@PLT
; X86ASM-NEXT: callq __extendhfsf2@PLT
; X86ASM-NEXT: addss %xmm0, %xmm0
; X86ASM-NEXT: callq __gnu_f2h_ieee@PLT
; X86ASM-NEXT: popq %rcx
; X86ASM-NEXT: callq __truncsfhf2@PLT
; X86ASM-NEXT: popq %rax
; X86ASM-NEXT: .cfi_def_cfa_offset 8
; X86ASM-NEXT: retq
%y1 = freeze half undef

File diff suppressed because it is too large

@ -36,7 +36,7 @@ define float @half_to_float() strictfp {
; X64-NOF16C: ## %bb.0:
; X64-NOF16C-NEXT: pushq %rax
; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16
; X64-NOF16C-NEXT: movzwl _a(%rip), %edi
; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0
; X64-NOF16C-NEXT: callq ___extendhfsf2
; X64-NOF16C-NEXT: popq %rax
; X64-NOF16C-NEXT: retq
@ -81,9 +81,8 @@ define double @half_to_double() strictfp {
; X64-NOF16C: ## %bb.0:
; X64-NOF16C-NEXT: pushq %rax
; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16
; X64-NOF16C-NEXT: movzwl _a(%rip), %edi
; X64-NOF16C-NEXT: callq ___extendhfsf2
; X64-NOF16C-NEXT: cvtss2sd %xmm0, %xmm0
; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0
; X64-NOF16C-NEXT: callq ___extendhfdf2
; X64-NOF16C-NEXT: popq %rax
; X64-NOF16C-NEXT: retq
;
@ -112,37 +111,30 @@ define x86_fp80 @half_to_fp80() strictfp {
;
; X32-F16C-LABEL: half_to_fp80:
; X32-F16C: ## %bb.0:
; X32-F16C-NEXT: pushl %eax
; X32-F16C-NEXT: .cfi_def_cfa_offset 8
; X32-F16C-NEXT: movzwl _a, %eax
; X32-F16C-NEXT: vmovd %eax, %xmm0
; X32-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; X32-F16C-NEXT: vmovss %xmm0, (%esp)
; X32-F16C-NEXT: flds (%esp)
; X32-F16C-NEXT: wait
; X32-F16C-NEXT: popl %eax
; X32-F16C-NEXT: subl $12, %esp
; X32-F16C-NEXT: .cfi_def_cfa_offset 16
; X32-F16C-NEXT: vpinsrw $0, _a, %xmm0, %xmm0
; X32-F16C-NEXT: vpextrw $0, %xmm0, (%esp)
; X32-F16C-NEXT: calll ___extendhfxf2
; X32-F16C-NEXT: addl $12, %esp
; X32-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: half_to_fp80:
; X64-NOF16C: ## %bb.0:
; X64-NOF16C-NEXT: pushq %rax
; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16
; X64-NOF16C-NEXT: movzwl _a(%rip), %edi
; X64-NOF16C-NEXT: callq ___extendhfsf2
; X64-NOF16C-NEXT: movss %xmm0, {{[0-9]+}}(%rsp)
; X64-NOF16C-NEXT: flds {{[0-9]+}}(%rsp)
; X64-NOF16C-NEXT: wait
; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0
; X64-NOF16C-NEXT: callq ___extendhfxf2
; X64-NOF16C-NEXT: popq %rax
; X64-NOF16C-NEXT: retq
;
; X64-F16C-LABEL: half_to_fp80:
; X64-F16C: ## %bb.0:
; X64-F16C-NEXT: movzwl _a(%rip), %eax
; X64-F16C-NEXT: vmovd %eax, %xmm0
; X64-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; X64-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
; X64-F16C-NEXT: flds -{{[0-9]+}}(%rsp)
; X64-F16C-NEXT: wait
; X64-F16C-NEXT: pushq %rax
; X64-F16C-NEXT: .cfi_def_cfa_offset 16
; X64-F16C-NEXT: vpinsrw $0, _a(%rip), %xmm0, %xmm0
; X64-F16C-NEXT: callq ___extendhfxf2
; X64-F16C-NEXT: popq %rax
; X64-F16C-NEXT: retq
%1 = load half, half* @a, align 2
%2 = tail call x86_fp80 @llvm.experimental.constrained.fpext.f80.f16(half %1, metadata !"fpexcept.strict") #0
@ -166,7 +158,8 @@ define void @float_to_half(float %0) strictfp {
; X32-F16C: ## %bb.0:
; X32-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X32-F16C-NEXT: vpextrw $0, %xmm0, _a
; X32-F16C-NEXT: vmovd %xmm0, %eax
; X32-F16C-NEXT: movw %ax, _a
; X32-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: float_to_half:
@ -174,6 +167,7 @@ define void @float_to_half(float %0) strictfp {
; X64-NOF16C-NEXT: pushq %rax
; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16
; X64-NOF16C-NEXT: callq ___truncsfhf2
; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax
; X64-NOF16C-NEXT: movw %ax, _a(%rip)
; X64-NOF16C-NEXT: popq %rax
; X64-NOF16C-NEXT: retq
@ -183,7 +177,8 @@ define void @float_to_half(float %0) strictfp {
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip)
; X64-F16C-NEXT: vmovd %xmm0, %eax
; X64-F16C-NEXT: movw %ax, _a(%rip)
; X64-F16C-NEXT: retq
%2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
store half %2, half* @a, align 2
@ -205,13 +200,13 @@ define void @double_to_half(double %0) strictfp {
;
; X32-F16C-LABEL: double_to_half:
; X32-F16C: ## %bb.0:
; X32-F16C-NEXT: subl $12, %esp
; X32-F16C-NEXT: .cfi_def_cfa_offset 16
; X32-F16C-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-F16C-NEXT: vmovsd %xmm0, (%esp)
; X32-F16C-NEXT: calll ___truncdfhf2
; X32-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; X32-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X32-F16C-NEXT: vmovd %xmm0, %eax
; X32-F16C-NEXT: movw %ax, _a
; X32-F16C-NEXT: addl $12, %esp
; X32-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: double_to_half:
@ -219,17 +214,19 @@ define void @double_to_half(double %0) strictfp {
; X64-NOF16C-NEXT: pushq %rax
; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16
; X64-NOF16C-NEXT: callq ___truncdfhf2
; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax
; X64-NOF16C-NEXT: movw %ax, _a(%rip)
; X64-NOF16C-NEXT: popq %rax
; X64-NOF16C-NEXT: retq
;
; X64-F16C-LABEL: double_to_half:
; X64-F16C: ## %bb.0:
; X64-F16C-NEXT: pushq %rax
; X64-F16C-NEXT: .cfi_def_cfa_offset 16
; X64-F16C-NEXT: callq ___truncdfhf2
; X64-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X64-F16C-NEXT: vmovd %xmm0, %eax
; X64-F16C-NEXT: movw %ax, _a(%rip)
; X64-F16C-NEXT: popq %rax
; X64-F16C-NEXT: retq
%2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
store half %2, half* @a, align 2
@ -257,7 +254,7 @@ define void @fp80_to_half(x86_fp80 %0) strictfp {
; X32-F16C-NEXT: fstpt (%esp)
; X32-F16C-NEXT: wait
; X32-F16C-NEXT: calll ___truncxfhf2
; X32-F16C-NEXT: movw %ax, _a
; X32-F16C-NEXT: vpextrw $0, %xmm0, _a
; X32-F16C-NEXT: addl $28, %esp
; X32-F16C-NEXT: retl
;
@ -269,6 +266,7 @@ define void @fp80_to_half(x86_fp80 %0) strictfp {
; X64-NOF16C-NEXT: fstpt (%rsp)
; X64-NOF16C-NEXT: wait
; X64-NOF16C-NEXT: callq ___truncxfhf2
; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax
; X64-NOF16C-NEXT: movw %ax, _a(%rip)
; X64-NOF16C-NEXT: addq $24, %rsp
; X64-NOF16C-NEXT: retq
@ -281,7 +279,7 @@ define void @fp80_to_half(x86_fp80 %0) strictfp {
; X64-F16C-NEXT: fstpt (%rsp)
; X64-F16C-NEXT: wait
; X64-F16C-NEXT: callq ___truncxfhf2
; X64-F16C-NEXT: movw %ax, _a(%rip)
; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip)
; X64-F16C-NEXT: addq $24, %rsp
; X64-F16C-NEXT: retq
%2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f80(x86_fp80 %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
@ -323,20 +321,22 @@ define void @add() strictfp {
; X32-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X32-F16C-NEXT: vpextrw $0, %xmm0, _c
; X32-F16C-NEXT: vmovd %xmm0, %eax
; X32-F16C-NEXT: movw %ax, _c
; X32-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: add:
; X64-NOF16C: ## %bb.0:
; X64-NOF16C-NEXT: pushq %rax
; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16
; X64-NOF16C-NEXT: movzwl _a(%rip), %edi
; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0
; X64-NOF16C-NEXT: callq ___extendhfsf2
; X64-NOF16C-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; X64-NOF16C-NEXT: movzwl _b(%rip), %edi
; X64-NOF16C-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
; X64-NOF16C-NEXT: pinsrw $0, _b(%rip), %xmm0
; X64-NOF16C-NEXT: callq ___extendhfsf2
; X64-NOF16C-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload
; X64-NOF16C-NEXT: callq ___truncsfhf2
; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax
; X64-NOF16C-NEXT: movw %ax, _c(%rip)
; X64-NOF16C-NEXT: popq %rax
; X64-NOF16C-NEXT: retq
@ -353,7 +353,8 @@ define void @add() strictfp {
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; X64-F16C-NEXT: vpextrw $0, %xmm0, _c(%rip)
; X64-F16C-NEXT: vmovd %xmm0, %eax
; X64-F16C-NEXT: movw %ax, _c(%rip)
; X64-F16C-NEXT: retq
%1 = load half, half* @a, align 2
%2 = tail call float @llvm.experimental.constrained.fpext.f32.f16(half %1, metadata !"fpexcept.strict") #0

File diff suppressed because it is too large

@ -7,68 +7,77 @@
define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; X86-LABEL: ir_fadd_v1f16:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $12, %esp
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: movl %esi, (%esp)
; X86-NEXT: subl $28, %esp
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movss %xmm0, (%esp)
; X86-NEXT: calll __gnu_f2h_ieee
; X86-NEXT: addl $12, %esp
; X86-NEXT: popl %esi
; X86-NEXT: calll __truncsfhf2
; X86-NEXT: addl $28, %esp
; X86-NEXT: retl
;
; X64-LABEL: ir_fadd_v1f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rbx
; X64-NEXT: subq $16, %rsp
; X64-NEXT: movl %edi, %ebx
; X64-NEXT: movzwl %si, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: pushq %rax
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movzwl %bx, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; X64-NEXT: callq __gnu_f2h_ieee@PLT
; X64-NEXT: addq $16, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload
; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: popq %rax
; X64-NEXT: retq
;
; F16C-LABEL: ir_fadd_v1f16:
; F16C: # %bb.0:
; F16C-NEXT: movzwl %si, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vpextrw $0, %xmm0, %eax
; F16C-NEXT: vpextrw $0, %xmm1, %ecx
; F16C-NEXT: movzwl %cx, %ecx
; F16C-NEXT: vmovd %ecx, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: movzwl %di, %eax
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-NEXT: retq
;
; F16C-O0-LABEL: ir_fadd_v1f16:
; F16C-O0: # %bb.0:
; F16C-O0-NEXT: movw %si, %cx
; F16C-O0-NEXT: movw %di, %ax
; F16C-O0-NEXT: movzwl %cx, %ecx
; F16C-O0-NEXT: vmovd %ecx, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1
; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: movzwl %ax, %eax
; F16C-O0-NEXT: vmovd %eax, %xmm1
; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: movzwl %ax, %eax
; F16C-O0-NEXT: vmovd %eax, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-O0-NEXT: vmovd %xmm0, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: movw %ax, %cx
; F16C-O0-NEXT: # implicit-def: $eax
; F16C-O0-NEXT: movw %cx, %ax
; F16C-O0-NEXT: # implicit-def: $xmm0
; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-O0-NEXT: retq
%retval = fadd <1 x half> %arg0, %arg1
ret <1 x half> %retval
@ -77,148 +86,148 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; X86-LABEL: ir_fadd_v2f16:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: movzwl 8(%ebp), %esi
; X86-NEXT: movzwl 12(%ebp), %edi
; X86-NEXT: movzwl 20(%ebp), %ebx
; X86-NEXT: movzwl 16(%ebp), %eax
; X86-NEXT: movl %eax, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: subl $80, %esp
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT: movl %ebx, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT: movl %edi, (%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: movl %esi, (%esp)
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll __gnu_h2f_ieee
; X86-NEXT: calll __extendhfsf2
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movss %xmm0, (%esp)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: calll __gnu_f2h_ieee
; X86-NEXT: calll __truncsfhf2
; X86-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movss %xmm0, (%esp)
; X86-NEXT: movw %ax, {{[0-9]+}}(%esp)
; X86-NEXT: calll __gnu_f2h_ieee
; X86-NEXT: movw %ax, {{[0-9]+}}(%esp)
; X86-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movd %xmm0, %eax
; X86-NEXT: pextrw $1, %xmm0, %edx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: # kill: def $dx killed $dx killed $edx
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: calll __truncsfhf2
; X86-NEXT: movaps %xmm0, %xmm1
; X86-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; X86-NEXT: # xmm0 = mem[0],zero,zero,zero
; X86-NEXT: addl $80, %esp
; X86-NEXT: retl
;
; X64-LABEL: ir_fadd_v2f16:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %rbx
; X64-NEXT: subq $32, %rsp
; X64-NEXT: movl %edx, %ebp
; X64-NEXT: movl %esi, %ebx
; X64-NEXT: movl %edi, %r14d
; X64-NEXT: movzwl %cx, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: subq $24, %rsp
; X64-NEXT: movss %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movzwl %bx, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; X64-NEXT: callq __gnu_f2h_ieee@PLT
; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; X64-NEXT: movzwl %bp, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movzwl %r14w, %edi
; X64-NEXT: callq __gnu_h2f_ieee@PLT
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; X64-NEXT: callq __gnu_f2h_ieee@PLT
; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; X64-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: pextrw $1, %xmm0, %edx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: # kill: def $dx killed $dx killed $edx
; X64-NEXT: addq $32, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r14
; X64-NEXT: popq %rbp
; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; X64-NEXT: callq __truncsfhf2@PLT
; X64-NEXT: movaps %xmm0, %xmm1
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
; F16C-LABEL: ir_fadd_v2f16:
; F16C: # %bb.0:
; F16C-NEXT: movzwl %cx, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vpextrw $0, %xmm1, %eax
; F16C-NEXT: vpextrw $0, %xmm3, %ecx
; F16C-NEXT: vpextrw $0, %xmm0, %edx
; F16C-NEXT: vpextrw $0, %xmm2, %esi
; F16C-NEXT: movzwl %si, %esi
; F16C-NEXT: vmovd %esi, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: movzwl %si, %eax
; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: movzwl %dx, %edx
; F16C-NEXT: vmovd %edx, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
; F16C-NEXT: movzwl %dx, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: movzwl %di, %eax
; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: vmovd %xmm0, %edx
; F16C-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0
; F16C-NEXT: movzwl %cx, %ecx
; F16C-NEXT: vmovd %ecx, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
; F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; F16C-NEXT: vmovd %xmm0, %eax
; F16C-NEXT: vpextrw $1, %xmm0, %edx
; F16C-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-NEXT: # kill: def $dx killed $dx killed $edx
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm2
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-NEXT: vaddss %xmm1, %xmm2, %xmm1
; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; F16C-NEXT: vmovd %xmm1, %eax
; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
; F16C-NEXT: retq
;
; F16C-O0-LABEL: ir_fadd_v2f16:
; F16C-O0: # %bb.0:
; F16C-O0-NEXT: movl %esi, %eax
; F16C-O0-NEXT: # kill: def $cx killed $cx killed $ecx
; F16C-O0-NEXT: movw %dx, %si
; F16C-O0-NEXT: vpextrw $0, %xmm2, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: movzwl %ax, %eax
; F16C-O0-NEXT: vmovd %eax, %xmm2
; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: movw %di, %dx
; F16C-O0-NEXT: movzwl %si, %esi
; F16C-O0-NEXT: vmovd %esi, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1
; F16C-O0-NEXT: movzwl %dx, %edx
; F16C-O0-NEXT: vmovd %edx, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-O0-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
; F16C-O0-NEXT: movzwl %cx, %ecx
; F16C-O0-NEXT: vmovd %ecx, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1
; F16C-O0-NEXT: movzwl %ax, %eax
; F16C-O0-NEXT: vmovd %eax, %xmm0
; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0
; F16C-O0-NEXT: vaddss %xmm2, %xmm0, %xmm0
; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; F16C-O0-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
; F16C-O0-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
; F16C-O0-NEXT: vmovd %xmm0, %eax
; F16C-O0-NEXT: movw %ax, %cx
; F16C-O0-NEXT: # implicit-def: $eax
; F16C-O0-NEXT: movw %cx, %ax
; F16C-O0-NEXT: # implicit-def: $xmm0
; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; F16C-O0-NEXT: vpextrw $0, %xmm3, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: vpextrw $1, %xmm0, %ecx
; F16C-O0-NEXT: movw %cx, %dx
; F16C-O0-NEXT: movzwl %ax, %eax
; F16C-O0-NEXT: vmovd %eax, %xmm2
; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax
; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax
; F16C-O0-NEXT: movzwl %ax, %eax
; F16C-O0-NEXT: vmovd %eax, %xmm1
; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-O0-NEXT: vaddss %xmm2, %xmm1, %xmm1
; F16C-O0-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; F16C-O0-NEXT: vmovd %xmm1, %eax
; F16C-O0-NEXT: movw %ax, %cx
; F16C-O0-NEXT: # implicit-def: $eax
; F16C-O0-NEXT: movw %cx, %ax
; F16C-O0-NEXT: # implicit-def: $xmm1
; F16C-O0-NEXT: vpinsrw $0, %eax, %xmm1, %xmm1
; F16C-O0-NEXT: retq
%retval = fadd <2 x half> %arg0, %arg1
ret <2 x half> %retval


@ -1,23 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512fp16 | FileCheck %s --check-prefix=AVX512FP16
; This test makes sure that a vector that needs to be promoted and is bitcast to fp16 is legalized correctly without causing a width mismatch.
define void @constant_fold_vector_to_half() {
; CHECK-LABEL: constant_fold_vector_to_half:
; CHECK: # %bb.0:
; CHECK-NEXT: movw $16384, (%rax) # imm = 0x4000
; CHECK-NEXT: retq
; SSE2-LABEL: constant_fold_vector_to_half:
; SSE2: # %bb.0:
; SSE2-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000
; SSE2-NEXT: pinsrw $0, -{{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rax)
; SSE2-NEXT: retq
;
; AVX512-LABEL: constant_fold_vector_to_half:
; AVX512: # %bb.0:
; AVX512-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000
; AVX512-NEXT: vpinsrw $0, -{{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX512-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX512-NEXT: retq
;
; AVX512FP16-LABEL: constant_fold_vector_to_half:
; AVX512FP16: # %bb.0:
; AVX512FP16-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000
; AVX512FP16-NEXT: vmovsh -{{[0-9]+}}(%rsp), %xmm0
; AVX512FP16-NEXT: vmovsh %xmm0, (%rax)
; AVX512FP16-NEXT: retq
store volatile half bitcast (<4 x i4> <i4 0, i4 0, i4 0, i4 4> to half), half* undef
ret void
}
; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing.
define void @pr38533_2(half %x) {
; CHECK-LABEL: pr38533_2:
; CHECK: # %bb.0:
; CHECK-NEXT: movw %di, (%rax)
; CHECK-NEXT: retq
; SSE2-LABEL: pr38533_2:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rax)
; SSE2-NEXT: retq
;
; AVX512-LABEL: pr38533_2:
; AVX512: # %bb.0:
; AVX512-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX512-NEXT: retq
;
; AVX512FP16-LABEL: pr38533_2:
; AVX512FP16: # %bb.0:
; AVX512FP16-NEXT: vmovsh %xmm0, (%rax)
; AVX512FP16-NEXT: retq
%a = bitcast half %x to <4 x i4>
store volatile <4 x i4> %a, <4 x i4>* undef
ret void
@ -25,10 +54,21 @@ define void @pr38533_2(half %x) {
; This case is a bitcast from fp16 to a 16-bit wide legal vector type, where the result type is already legal when the bitcast gets type legalized.
define void @pr38533_3(half %x) {
; CHECK-LABEL: pr38533_3:
; CHECK: # %bb.0:
; CHECK-NEXT: movw %di, (%rax)
; CHECK-NEXT: retq
; SSE2-LABEL: pr38533_3:
; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm0, %eax
; SSE2-NEXT: movw %ax, (%rax)
; SSE2-NEXT: retq
;
; AVX512-LABEL: pr38533_3:
; AVX512: # %bb.0:
; AVX512-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX512-NEXT: retq
;
; AVX512FP16-LABEL: pr38533_3:
; AVX512FP16: # %bb.0:
; AVX512FP16-NEXT: vmovsh %xmm0, (%rax)
; AVX512FP16-NEXT: retq
%a = bitcast half %x to <16 x i1>
store volatile <16 x i1> %a, <16 x i1>* undef
ret void


@ -7,55 +7,86 @@ target triple = "i386-unknown-linux-unknown"
define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-LABEL: doTheTestMod:
; CHECK: # %bb.0: # %Entry
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: subl $124, %esp
; CHECK-NEXT: # implicit-def: $xmm3
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm3
; CHECK-NEXT: # implicit-def: $xmm2
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm2
; CHECK-NEXT: # implicit-def: $xmm1
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1
; CHECK-NEXT: # implicit-def: $xmm0
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: # implicit-def: $xmm4
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm4
; CHECK-NEXT: # implicit-def: $xmm5
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm5
; CHECK-NEXT: # implicit-def: $xmm6
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm6
; CHECK-NEXT: # implicit-def: $xmm7
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm7
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %si
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %ax
; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %di
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bx
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bp
; CHECK-NEXT: movw {{[0-9]+}}(%esp), %ax
; CHECK-NEXT: pextrw $0, %xmm7, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload
; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw %bx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw %di, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw %si, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp)
; CHECK-NEXT: pextrw $0, %xmm6, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: pextrw $0, %xmm5, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: pextrw $0, %xmm4, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: pextrw $0, %xmm3, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: pextrw $0, %xmm2, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: pextrw $0, %xmm1, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
; CHECK-NEXT: # implicit-def: $xmm0
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: # implicit-def: $xmm0
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: # implicit-def: $xmm0
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: # implicit-def: $xmm0
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; CHECK-NEXT: # implicit-def: $xmm1
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: # implicit-def: $xmm1
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: # implicit-def: $xmm1
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: # implicit-def: $xmm1
; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm1
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fxch %st(1)
@ -64,17 +95,24 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-NEXT: calll fmodf
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fstps (%eax)
; CHECK-NEXT: calll __gnu_f2h_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT: calll __truncsfhf2
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fxch %st(1)
@ -83,17 +121,24 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-NEXT: calll fmodf
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fstps (%eax)
; CHECK-NEXT: calll __gnu_f2h_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %ax, %si
; CHECK-NEXT: calll __truncsfhf2
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fxch %st(1)
@ -102,17 +147,24 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-NEXT: calll fmodf
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fstps (%eax)
; CHECK-NEXT: calll __gnu_f2h_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %ax, %di
; CHECK-NEXT: calll __truncsfhf2
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, %cx
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: movl %ecx, (%eax)
; CHECK-NEXT: calll __gnu_h2f_ieee
; CHECK-NEXT: movw %cx, (%eax)
; CHECK-NEXT: calll __extendhfsf2
; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fxch %st(1)
@ -121,20 +173,29 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind {
; CHECK-NEXT: calll fmodf
; CHECK-NEXT: movl %esp, %eax
; CHECK-NEXT: fstps (%eax)
; CHECK-NEXT: calll __gnu_f2h_ieee
; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload
; CHECK-NEXT: calll __truncsfhf2
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Reload
; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-NEXT: movw %ax, %bx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT: movw %bx, 6(%ecx)
; CHECK-NEXT: movw %di, 4(%ecx)
; CHECK-NEXT: movw %si, 2(%ecx)
; CHECK-NEXT: movaps %xmm0, %xmm3
; CHECK-NEXT: movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pextrw $0, %xmm3, %edx
; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
; CHECK-NEXT: movw %dx, 6(%ecx)
; CHECK-NEXT: pextrw $0, %xmm2, %edx
; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
; CHECK-NEXT: movw %dx, 4(%ecx)
; CHECK-NEXT: pextrw $0, %xmm1, %edx
; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
; CHECK-NEXT: movw %dx, 2(%ecx)
; CHECK-NEXT: pextrw $0, %xmm0, %edx
; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
; CHECK-NEXT: movw %dx, (%ecx)
; CHECK-NEXT: addl $124, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl $4
Entry:
%x = alloca <4 x half>, align 8


@ -128,7 +128,7 @@ body: |
; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_daddr, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_daddr, !tbaa !4)
; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_proto, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_proto, !tbaa !4)
; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags
; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2293771 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, [[MOV8rm]], 2293769 /* reguse:GR32 */, [[MOV32rm]], 2293769 /* reguse:GR32 */, [[MOV32r0_]], 2293769 /* reguse:GR32 */, [[MOV32rm1]], 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !8
; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2359307 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, [[MOV8rm]], 2359305 /* reguse:GR32 */, [[MOV32rm]], 2359305 /* reguse:GR32 */, [[MOV32r0_]], 2359305 /* reguse:GR32 */, [[MOV32rm1]], 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !8
; CHECK-NEXT: MOV32mr $noreg, 1, $noreg, @csum_ipv6_magic_sum, $noreg, %2 :: (store (s32) into @csum_ipv6_magic_sum, !tbaa !4)
; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm $noreg, 1, $noreg, @synproxy_send_tcp_ipv6_nskb, $noreg :: (dereferenceable load (s32) from `i8** bitcast (%struct.sk_buff** @synproxy_send_tcp_ipv6_nskb to i8**)`, !tbaa !9)
; CHECK-NEXT: OR8mi [[MOV32rm2]], 1, $noreg, 0, $noreg, 3, implicit-def dead $eflags :: (store (s8) into %ir.4), (load (s8) from %ir.4)
@ -143,7 +143,7 @@ body: |
%4:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_daddr, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_daddr, !tbaa !5)
%6:gr32 = MOV32rm $noreg, 1, $noreg, @csum_ipv6_magic_proto, $noreg :: (dereferenceable load (s32) from @csum_ipv6_magic_proto, !tbaa !5)
%5:gr32 = MOV32r0 implicit-def dead $eflags
INLINEASM &"", 0 /* attdialect */, 2293771 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, %3, 2293769 /* reguse:GR32 */, %4, 2293769 /* reguse:GR32 */, %5, 2293769 /* reguse:GR32 */, %6, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !9
INLINEASM &"", 0 /* attdialect */, 2359307 /* regdef-ec:GR32 */, def early-clobber %2, 65545 /* reguse:GR8 */, %3, 2359305 /* reguse:GR32 */, %4, 2359305 /* reguse:GR32 */, %5, 2359305 /* reguse:GR32 */, %6, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags, !9
MOV32mr $noreg, 1, $noreg, @csum_ipv6_magic_sum, $noreg, %2 :: (store (s32) into @csum_ipv6_magic_sum, !tbaa !5)
%7:gr32 = MOV32rm $noreg, 1, $noreg, @synproxy_send_tcp_ipv6_nskb, $noreg :: (dereferenceable load (s32) from `i8** bitcast (%struct.sk_buff** @synproxy_send_tcp_ipv6_nskb to i8**)`, !tbaa !10)
OR8mi %7, 1, $noreg, 0, $noreg, 3, implicit-def dead $eflags :: (store (s8) into %ir.4), (load (s8) from %ir.4)


@ -4,13 +4,30 @@
define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: movq (%rsi), %rax
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, (%rdx)
; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
; CHECK-NEXT: pinsrw $0, 2(%rdi), %xmm1
; CHECK-NEXT: pinsrw $0, 4(%rdi), %xmm2
; CHECK-NEXT: pinsrw $0, 6(%rdi), %xmm3
; CHECK-NEXT: pinsrw $0, (%rsi), %xmm4
; CHECK-NEXT: pinsrw $0, 2(%rsi), %xmm5
; CHECK-NEXT: pinsrw $0, 4(%rsi), %xmm6
; CHECK-NEXT: pinsrw $0, 6(%rsi), %xmm7
; CHECK-NEXT: pextrw $0, %xmm7, %eax
; CHECK-NEXT: movw %ax, 14(%rdx)
; CHECK-NEXT: pextrw $0, %xmm3, %eax
; CHECK-NEXT: movw %ax, 12(%rdx)
; CHECK-NEXT: pextrw $0, %xmm6, %eax
; CHECK-NEXT: movw %ax, 10(%rdx)
; CHECK-NEXT: pextrw $0, %xmm2, %eax
; CHECK-NEXT: movw %ax, 8(%rdx)
; CHECK-NEXT: pextrw $0, %xmm5, %eax
; CHECK-NEXT: movw %ax, 6(%rdx)
; CHECK-NEXT: pextrw $0, %xmm1, %eax
; CHECK-NEXT: movw %ax, 4(%rdx)
; CHECK-NEXT: pextrw $0, %xmm4, %eax
; CHECK-NEXT: movw %ax, 2(%rdx)
; CHECK-NEXT: pextrw $0, %xmm0, %eax
; CHECK-NEXT: movw %ax, (%rdx)
; CHECK-NEXT: retq
%tmp4 = load <4 x half>, <4 x half>* %a
%tmp5 = load <4 x half>, <4 x half>* %b


@ -803,7 +803,7 @@ define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a
define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmadd123sh:
;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
ret half %2
@ -812,7 +812,7 @@ declare half @llvm.fma.f16(half, half, half)
define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmadd213sh:
;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
ret half %2
@ -820,7 +820,7 @@ define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmadd231sh:
;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
ret half %2
@ -828,7 +828,7 @@ define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmadd321sh:
;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
ret half %2
@ -836,7 +836,7 @@ define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmadd132sh:
;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
ret half %2
@ -844,7 +844,7 @@ define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmadd312sh:
;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
ret half %2
@ -852,7 +852,7 @@ define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmsub123sh:
;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a2
%3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
@ -861,7 +861,7 @@ define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmsub213sh:
;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a2
%3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
@ -870,7 +870,7 @@ define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmsub231sh:
;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a0
%3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
@ -879,7 +879,7 @@ define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmsub321sh:
;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a0
%3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
@ -888,7 +888,7 @@ define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmsub132sh:
;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a1
%3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
@ -897,7 +897,7 @@ define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fmsub312sh:
;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a1
%3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
@ -906,7 +906,7 @@ define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmadd123sh:
;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a0
%3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
@ -915,7 +915,7 @@ define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmadd213sh:
;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a1
%3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
@ -924,7 +924,7 @@ define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmadd231sh:
;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a1
%3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
@ -933,7 +933,7 @@ define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmadd321sh:
;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a2
%3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
@ -942,7 +942,7 @@ define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmadd132sh:
;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a0
%3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
@ -951,7 +951,7 @@ define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmadd312sh:
;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a2
%3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
@ -960,7 +960,7 @@ define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmsub123sh:
;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a0
%3 = fneg half %a2
@ -970,7 +970,7 @@ define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmsub213sh:
;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a1
%3 = fneg half %a2
@ -980,7 +980,7 @@ define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmsub231sh:
;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a1
%3 = fneg half %a0
@ -990,7 +990,7 @@ define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmsub321sh:
;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a2
%3 = fneg half %a0
@ -1000,7 +1000,7 @@ define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmsub132sh:
;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a0
%3 = fneg half %a1
@ -1010,7 +1010,7 @@ define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
;CHECK-LABEL: stack_fold_fnmsub312sh:
;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fneg half %a2
%3 = fneg half %a1


@ -50,7 +50,7 @@ define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
define half @stack_fold_addsh(half %a0, half %a1) {
;CHECK-LABEL: stack_fold_addsh
;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fadd half %a0, %a1
ret half %2
@ -107,7 +107,7 @@ define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half>
define half @stack_fold_divsh(half %a0, half %a1) {
;CHECK-LABEL: stack_fold_divsh
;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fdiv half %a0, %a1
ret half %2
@ -390,7 +390,7 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0,
define half @stack_fold_maxsh(half %a0, half %a1) #0 {
;CHECK-LABEL: stack_fold_maxsh:
;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
@ -399,7 +399,7 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 {
define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
;CHECK-LABEL: stack_fold_maxsh_commuted:
;CHECK-NOT: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK-NOT: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
@ -408,7 +408,7 @@ define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
;CHECK-LABEL: stack_fold_maxsh_commutable:
;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
@ -417,7 +417,7 @@ define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
;CHECK-LABEL: stack_fold_maxsh_commutable_commuted:
;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp ogt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
@ -569,7 +569,7 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0,
define half @stack_fold_minsh(half %a0, half %a1) #0 {
;CHECK-LABEL: stack_fold_minsh:
;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
@ -578,7 +578,7 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 {
define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
;CHECK-LABEL: stack_fold_minsh_commuted:
;CHECK-NOT: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK-NOT: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
@ -587,7 +587,7 @@ define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
;CHECK-LABEL: stack_fold_minsh_commutable:
;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a0, %a1
%3 = select i1 %2, half %a0, half %a1
@ -596,7 +596,7 @@ define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
;CHECK-LABEL: stack_fold_minsh_commutable_commuted:
;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fcmp olt half %a1, %a0
%3 = select i1 %2, half %a1, half %a0
@ -671,7 +671,7 @@ define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
define half @stack_fold_mulsh(half %a0, half %a1) {
;CHECK-LABEL: stack_fold_mulsh
;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fmul half %a0, %a1
ret half %2
@ -972,7 +972,7 @@ define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) {
define half @stack_fold_subsh(half %a0, half %a1) {
;CHECK-LABEL: stack_fold_subsh
;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload
;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = fsub half %a0, %a1
ret half %2


@ -340,7 +340,7 @@ body: |
; CHECK: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags
; CHECK: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0
; CHECK: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags
; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %102, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %102, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
; CHECK: LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK: $rdi = COPY [[COPY4]]
@ -456,7 +456,7 @@ body: |
%63:gr64 = NOT64r %63
CMP64rr %63, %31, implicit-def $eflags
%63:gr64 = CMOV64rr %63, %53, 4, implicit killed $eflags
INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %53, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %53, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, %65, implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1)
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
$rdi = COPY %64


@ -2101,58 +2101,56 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; SSE-LABEL: fptosi_2f16_to_4i32:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %rbx
; SSE-NEXT: pushq %rax
; SSE-NEXT: movl %esi, %ebx
; SSE-NEXT: movzwl %di, %edi
; SSE-NEXT: callq __gnu_h2f_ieee@PLT
; SSE-NEXT: cvttss2si %xmm0, %ebp
; SSE-NEXT: movzwl %bx, %edi
; SSE-NEXT: callq __gnu_h2f_ieee@PLT
; SSE-NEXT: subq $16, %rsp
; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: cvttss2si %xmm0, %ebx
; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: cvttss2si %xmm0, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movd %ebp, %xmm1
; SSE-NEXT: movd %ebx, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
; SSE-NEXT: addq $8, %rsp
; SSE-NEXT: addq $16, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f16_to_4i32:
; VEX: # %bb.0:
; VEX-NEXT: pushq %rbp
; VEX-NEXT: pushq %rbx
; VEX-NEXT: pushq %rax
; VEX-NEXT: movl %esi, %ebx
; VEX-NEXT: movzwl %di, %edi
; VEX-NEXT: callq __gnu_h2f_ieee@PLT
; VEX-NEXT: vcvttss2si %xmm0, %ebp
; VEX-NEXT: movzwl %bx, %edi
; VEX-NEXT: callq __gnu_h2f_ieee@PLT
; VEX-NEXT: subq $16, %rsp
; VEX-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; VEX-NEXT: callq __extendhfsf2@PLT
; VEX-NEXT: vcvttss2si %xmm0, %ebx
; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero
; VEX-NEXT: callq __extendhfsf2@PLT
; VEX-NEXT: vcvttss2si %xmm0, %eax
; VEX-NEXT: vmovd %eax, %xmm0
; VEX-NEXT: vmovd %ebp, %xmm1
; VEX-NEXT: vmovd %ebx, %xmm1
; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; VEX-NEXT: addq $8, %rsp
; VEX-NEXT: addq $16, %rsp
; VEX-NEXT: popq %rbx
; VEX-NEXT: popq %rbp
; VEX-NEXT: retq
;
; AVX512-LABEL: fptosi_2f16_to_4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: movzwl %di, %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttss2si %xmm0, %eax
; AVX512-NEXT: movzwl %si, %ecx
; AVX512-NEXT: vpextrw $0, %xmm1, %eax
; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
; AVX512-NEXT: movzwl %cx, %ecx
; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttss2si %xmm0, %ecx
; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: movzwl %ax, %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vcvttss2si %xmm0, %eax
; AVX512-NEXT: vmovd %eax, %xmm0
; AVX512-NEXT: vmovd %ecx, %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512-NEXT: retq

File diff suppressed because it is too large


@ -368,69 +368,58 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; SSE-LABEL: test_v2f16:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $16, %rsp
; SSE-NEXT: movl %esi, %ebx
; SSE-NEXT: movl %edi, %r14d
; SSE-NEXT: movzwl %bx, %ebp
; SSE-NEXT: movl %ebp, %edi
; SSE-NEXT: callq __gnu_h2f_ieee@PLT
; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: movzwl %r14w, %edi
; SSE-NEXT: callq __gnu_h2f_ieee@PLT
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pextrw $0, %xmm1, %ebx
; SSE-NEXT: pextrw $0, %xmm0, %ebp
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: movw %bp, {{[0-9]+}}(%rsp)
; SSE-NEXT: cmoval %r14d, %ebx
; SSE-NEXT: movw %bx, (%rsp)
; SSE-NEXT: movl (%rsp), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: addq $16, %rsp
; SSE-NEXT: cmoval %ebp, %ebx
; SSE-NEXT: pinsrw $0, %ebx, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $16, %rsp
; AVX-NEXT: movl %esi, %ebx
; AVX-NEXT: movl %edi, %r14d
; AVX-NEXT: movzwl %bx, %ebp
; AVX-NEXT: movl %ebp, %edi
; AVX-NEXT: callq __gnu_h2f_ieee@PLT
; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT: movzwl %r14w, %edi
; AVX-NEXT: callq __gnu_h2f_ieee@PLT
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpextrw $0, %xmm1, %ebx
; AVX-NEXT: vpextrw $0, %xmm0, %ebp
; AVX-NEXT: vmovdqa %xmm1, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; AVX-NEXT: movw %bp, {{[0-9]+}}(%rsp)
; AVX-NEXT: cmoval %r14d, %ebx
; AVX-NEXT: movw %bx, (%rsp)
; AVX-NEXT: movl (%rsp), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: addq $16, %rsp
; AVX-NEXT: cmoval %ebp, %ebx
; AVX-NEXT: vpinsrw $0, %ebx, %xmm0, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v2f16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movzwl %si, %eax
; AVX512BW-NEXT: vmovd %eax, %xmm0
; AVX512BW-NEXT: vpextrw $0, %xmm0, %eax
; AVX512BW-NEXT: vpextrw $0, %xmm1, %ecx
; AVX512BW-NEXT: movzwl %cx, %ecx
; AVX512BW-NEXT: vmovd %ecx, %xmm0
; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512BW-NEXT: movzwl %di, %ecx
; AVX512BW-NEXT: vmovd %ecx, %xmm1
; AVX512BW-NEXT: movzwl %ax, %eax
; AVX512BW-NEXT: vmovd %eax, %xmm1
; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512BW-NEXT: vucomiss %xmm0, %xmm1
; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: cmoval %edi, %esi
; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: cmoval %eax, %ecx
; AVX512BW-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512FP16-LABEL: test_v2f16:


@ -367,69 +367,58 @@ define half @test_v2f16(<2 x half> %a0) nounwind {
; SSE-LABEL: test_v2f16:
; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $16, %rsp
; SSE-NEXT: movl %esi, %ebx
; SSE-NEXT: movl %edi, %r14d
; SSE-NEXT: movzwl %bx, %ebp
; SSE-NEXT: movl %ebp, %edi
; SSE-NEXT: callq __gnu_h2f_ieee@PLT
; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: movzwl %r14w, %edi
; SSE-NEXT: callq __gnu_h2f_ieee@PLT
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pextrw $0, %xmm1, %ebx
; SSE-NEXT: pextrw $0, %xmm0, %ebp
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: callq __extendhfsf2@PLT
; SSE-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT: movw %bp, {{[0-9]+}}(%rsp)
; SSE-NEXT: cmovbl %r14d, %ebx
; SSE-NEXT: movw %bx, (%rsp)
; SSE-NEXT: movl (%rsp), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: addq $16, %rsp
; SSE-NEXT: cmovbl %ebp, %ebx
; SSE-NEXT: pinsrw $0, %ebx, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f16:
; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $16, %rsp
; AVX-NEXT: movl %esi, %ebx
; AVX-NEXT: movl %edi, %r14d
; AVX-NEXT: movzwl %bx, %ebp
; AVX-NEXT: movl %ebp, %edi
; AVX-NEXT: callq __gnu_h2f_ieee@PLT
; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT: movzwl %r14w, %edi
; AVX-NEXT: callq __gnu_h2f_ieee@PLT
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpextrw $0, %xmm1, %ebx
; AVX-NEXT: vpextrw $0, %xmm0, %ebp
; AVX-NEXT: vmovdqa %xmm1, %xmm0
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: callq __extendhfsf2@PLT
; AVX-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; AVX-NEXT: movw %bp, {{[0-9]+}}(%rsp)
; AVX-NEXT: cmovbl %r14d, %ebx
; AVX-NEXT: movw %bx, (%rsp)
; AVX-NEXT: movl (%rsp), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: addq $16, %rsp
; AVX-NEXT: cmovbl %ebp, %ebx
; AVX-NEXT: vpinsrw $0, %ebx, %xmm0, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %rbp
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v2f16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movzwl %si, %eax
; AVX512BW-NEXT: vmovd %eax, %xmm0
; AVX512BW-NEXT: vpextrw $0, %xmm0, %eax
; AVX512BW-NEXT: vpextrw $0, %xmm1, %ecx
; AVX512BW-NEXT: movzwl %cx, %ecx
; AVX512BW-NEXT: vmovd %ecx, %xmm0
; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512BW-NEXT: movzwl %di, %ecx
; AVX512BW-NEXT: vmovd %ecx, %xmm1
; AVX512BW-NEXT: movzwl %ax, %eax
; AVX512BW-NEXT: vmovd %eax, %xmm1
; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512BW-NEXT: vucomiss %xmm0, %xmm1
; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: cmovbl %edi, %esi
; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT: cmovbl %eax, %ecx
; AVX512BW-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512FP16-LABEL: test_v2f16:

@ -5,16 +5,16 @@
// CHECK: Trying to match opcode MMX_PSHUFBrr
// CHECK: Matching formal operand class MCK_VR64 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode
// CHECK: Trying to match opcode PSHUFBrr
// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode
// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode
// CHECK: Trying to match opcode PSHUFBrm
// CHECK: Matching formal operand class MCK_Mem128 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
// CHECK: AsmMatcher: found 2 encodings with mnemonic 'sha1rnds4'
// CHECK: Trying to match opcode SHA1RNDS4rri
// CHECK: Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:1): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm2): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm2): match success using generic matcher
// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
// CHECK: AsmMatcher: found 4 encodings with mnemonic 'pinsrw'
// CHECK: Trying to match opcode MMX_PINSRWrr
@ -24,7 +24,7 @@
// CHECK: Trying to match opcode PINSRWrr
// CHECK: Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:3): match success using generic matcher
// CHECK: Matching formal operand class MCK_GR32orGR64 against actual operand at index 2 (Reg:ecx): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm5): match success using generic matcher
// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm5): match success using generic matcher
// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
// CHECK: AsmMatcher: found 2 encodings with mnemonic 'crc32l'
// CHECK: Trying to match opcode CRC32r32r32

@ -1,3 +1,5 @@
// FIXME: The runtime needs support for _Float16 on X86, see PR55992
// UNSUPPORTED: i386, x86_64
// RUN: mlir-opt %s --sparse-compiler | \
// RUN: mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void \

@ -1,4 +1,6 @@
// RUN: mlir-opt %s --sparse-compiler | \
// FIXME: The runtime needs support for _Float16 on X86, see PR55992
// UNSUPPORTED: i386, x86_64
// RUN: mlir-opt %s --sparse-compiler | \
// RUN: mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \