[X86] Refactor X86ScalarSSEf16/32/64 with hasFP16/SSE1/SSE2. NFCI
This is used for f16 emulation. We emulate f16 for SSE2 targets and above. The refactoring makes future code cleaner.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D122475
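For readers skimming the diff, here is a minimal standalone sketch (plain C++ with hypothetical Subtarget/FastISel stand-ins, not the LLVM classes themselves) of the pattern the refactor applies: drop the cached X86ScalarSSEf* booleans and query the subtarget feature predicates directly, so later f16-emulation work only needs hasSSE2()/hasFP16() checks.

#include <iostream>

// Stand-in for X86Subtarget: the single source of truth for ISA features.
struct Subtarget {
  bool SSE1 = true, SSE2 = true, FP16 = false;
  bool hasSSE1() const { return SSE1; }
  bool hasSSE2() const { return SSE2; }
  bool hasFP16() const { return FP16; }
};

enum class VT { f16, f32, f64, f80 };

// Before: cached flags that merely mirror subtarget queries and can drift.
struct FastISelBefore {
  const Subtarget *ST;
  bool X86ScalarSSEf32, X86ScalarSSEf64, X86ScalarSSEf16;
  explicit FastISelBefore(const Subtarget &S)
      : ST(&S), X86ScalarSSEf32(S.hasSSE1()), X86ScalarSSEf64(S.hasSSE2()),
        X86ScalarSSEf16(S.hasFP16()) {}
  bool isScalarFPTypeInSSEReg(VT T) const {
    return (T == VT::f64 && X86ScalarSSEf64) ||
           (T == VT::f32 && X86ScalarSSEf32) ||
           (T == VT::f16 && X86ScalarSSEf16);
  }
};

// After: ask the subtarget directly; no duplicated state to keep in sync.
struct FastISelAfter {
  const Subtarget *ST;
  explicit FastISelAfter(const Subtarget &S) : ST(&S) {}
  bool isScalarFPTypeInSSEReg(VT T) const {
    return (T == VT::f64 && ST->hasSSE2()) ||
           (T == VT::f32 && ST->hasSSE1()) ||
           (T == VT::f16 && ST->hasFP16());
  }
};

int main() {
  Subtarget S;
  FastISelBefore B(S);
  FastISelAfter A(S);
  // Both forms agree; the change is intended to be NFC (no functional change).
  std::cout << std::boolalpha << B.isScalarFPTypeInSSEReg(VT::f64) << ' '
            << A.isScalarFPTypeInSSEReg(VT::f64) << '\n';
}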
This commit is contained in:
parent 1fd118ffc4
commit 674d52e8ce
@@ -49,22 +49,11 @@ class X86FastISel final : public FastISel {
   /// make the right decision when generating code for different targets.
   const X86Subtarget *Subtarget;

-  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
-  /// floating point ops.
-  /// When SSE is available, use it for f32 operations.
-  /// When SSE2 is available, use it for f64 operations.
-  bool X86ScalarSSEf64;
-  bool X86ScalarSSEf32;
-  bool X86ScalarSSEf16;
-
 public:
   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
                        const TargetLibraryInfo *libInfo)
       : FastISel(funcInfo, libInfo) {
     Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
-    X86ScalarSSEf64 = Subtarget->hasSSE2();
-    X86ScalarSSEf32 = Subtarget->hasSSE1();
-    X86ScalarSSEf16 = Subtarget->hasFP16();
   }

   bool fastSelectInstruction(const Instruction *I) override;
@@ -158,9 +147,9 @@ private:
   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
   /// computed in an SSE register, not on the X87 floating point stack.
   bool isScalarFPTypeInSSEReg(EVT VT) const {
-    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
-           (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
-           (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
+    return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
+           (VT == MVT::f32 && Subtarget->hasSSE1()) ||
+           (VT == MVT::f16 && Subtarget->hasFP16());
   }

   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@@ -305,9 +294,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
   VT = evt.getSimpleVT();
   // For now, require SSE/SSE2 for performing floating-point operations,
   // since x87 requires additional work.
-  if (VT == MVT::f64 && !X86ScalarSSEf64)
+  if (VT == MVT::f64 && !Subtarget->hasSSE2())
     return false;
-  if (VT == MVT::f32 && !X86ScalarSSEf32)
+  if (VT == MVT::f32 && !Subtarget->hasSSE1())
     return false;
   // Similarly, no f80 support yet.
   if (VT == MVT::f80)
@@ -325,6 +314,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
 bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
                                   MachineMemOperand *MMO, unsigned &ResultReg,
                                   unsigned Alignment) {
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasSSE41 = Subtarget->hasSSE41();
   bool HasAVX = Subtarget->hasAVX();
   bool HasAVX2 = Subtarget->hasAVX2();
@@ -354,20 +345,16 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
     Opc = X86::MOV64rm;
     break;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
-            HasAVX    ? X86::VMOVSSrm_alt :
-                        X86::MOVSSrm_alt;
-    else
-      Opc = X86::LD_Fp32m;
+    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+          : HasAVX  ? X86::VMOVSSrm_alt
+          : HasSSE1 ? X86::MOVSSrm_alt
+                    : X86::LD_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
-            HasAVX    ? X86::VMOVSDrm_alt :
-                        X86::MOVSDrm_alt;
-    else
-      Opc = X86::LD_Fp64m;
+    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+          : HasAVX  ? X86::VMOVSDrm_alt
+          : HasSSE2 ? X86::MOVSDrm_alt
+                    : X86::LD_Fp64m;
     break;
   case MVT::f80:
     // No f80 support yet.
@@ -521,7 +508,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
     Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
     break;
   case MVT::f32:
-    if (X86ScalarSSEf32) {
+    if (HasSSE1) {
       if (IsNonTemporal && HasSSE4A)
         Opc = X86::MOVNTSS;
       else
@@ -531,7 +518,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
       Opc = X86::ST_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf32) {
+    if (HasSSE2) {
       if (IsNonTemporal && HasSSE4A)
         Opc = X86::MOVNTSD;
       else
@@ -1362,8 +1349,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
   bool HasAVX512 = Subtarget->hasAVX512();
   bool HasAVX = Subtarget->hasAVX();
-  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
-  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();

   switch (VT.getSimpleVT().SimpleTy) {
   default: return 0;
@@ -1372,15 +1359,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
   case MVT::i32: return X86::CMP32rr;
   case MVT::i64: return X86::CMP64rr;
   case MVT::f32:
-    return X86ScalarSSEf32
-               ? (HasAVX512 ? X86::VUCOMISSZrr
-                            : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
-               : 0;
+    return HasAVX512 ? X86::VUCOMISSZrr
+           : HasAVX  ? X86::VUCOMISSrr
+           : HasSSE1 ? X86::UCOMISSrr
+                     : 0;
   case MVT::f64:
-    return X86ScalarSSEf64
-               ? (HasAVX512 ? X86::VUCOMISDZrr
-                            : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
-               : 0;
+    return HasAVX512 ? X86::VUCOMISDZrr
+           : HasAVX  ? X86::VUCOMISDrr
+           : HasSSE2 ? X86::UCOMISDrr
+                     : 0;
   }
 }

@@ -2495,7 +2482,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
 }

 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
-  if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
+  if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
       I->getOperand(0)->getType()->isFloatTy()) {
     bool HasAVX512 = Subtarget->hasAVX512();
     // fpext from float to double.
@@ -2509,7 +2496,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
 }

 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
-  if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
+  if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
       I->getOperand(0)->getType()->isDoubleTy()) {
     bool HasAVX512 = Subtarget->hasAVX512();
     // fptrunc from double to float.
@@ -3733,25 +3720,23 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {

   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasAVX = Subtarget->hasAVX();
   bool HasAVX512 = Subtarget->hasAVX512();
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
-            HasAVX    ? X86::VMOVSSrm_alt :
-                        X86::MOVSSrm_alt;
-    else
-      Opc = X86::LD_Fp32m;
+    Opc = HasAVX512 ? X86::VMOVSSZrm_alt
+          : HasAVX  ? X86::VMOVSSrm_alt
+          : HasSSE1 ? X86::MOVSSrm_alt
+                    : X86::LD_Fp32m;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
-            HasAVX    ? X86::VMOVSDrm_alt :
-                        X86::MOVSDrm_alt;
-    else
-      Opc = X86::LD_Fp64m;
+    Opc = HasAVX512 ? X86::VMOVSDZrm_alt
+          : HasAVX  ? X86::VMOVSDrm_alt
+          : HasSSE2 ? X86::MOVSDrm_alt
+                    : X86::LD_Fp64m;
     break;
   case MVT::f80:
     // No f80 support yet.
@@ -3852,11 +3837,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
   default:
     break;
   case MVT::f32:
-    if (!X86ScalarSSEf32)
+    if (!Subtarget->hasSSE1())
       Opc = X86::LD_Fp032;
     break;
   case MVT::f64:
-    if (!X86ScalarSSEf64)
+    if (!Subtarget->hasSSE2())
       Opc = X86::LD_Fp064;
     break;
   case MVT::f80:
@@ -3907,21 +3892,21 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
     return 0;

   // Get opcode and regclass for the given zero.
+  bool HasSSE1 = Subtarget->hasSSE1();
+  bool HasSSE2 = Subtarget->hasSSE2();
   bool HasAVX512 = Subtarget->hasAVX512();
   unsigned Opc = 0;
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
-    if (X86ScalarSSEf32)
-      Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
-    else
-      Opc = X86::LD_Fp032;
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
+          : HasSSE1 ? X86::FsFLD0SS
+                    : X86::LD_Fp032;
     break;
   case MVT::f64:
-    if (X86ScalarSSEf64)
-      Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
-    else
-      Opc = X86::LD_Fp064;
+    Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
+          : HasSSE2 ? X86::FsFLD0SD
+                    : X86::LD_Fp064;
     break;
   case MVT::f80:
     // No f80 support yet.
@@ -108,9 +108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
-  X86ScalarSSEf64 = Subtarget.hasSSE2();
-  X86ScalarSSEf32 = Subtarget.hasSSE1();
-  X86ScalarSSEf16 = Subtarget.hasFP16();
   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

   // Set up the TargetLowering object.
@@ -314,7 +311,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
-  if (!X86ScalarSSEf64) {
+  if (!Subtarget.hasSSE2()) {
     setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
     setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
     if (Subtarget.is64Bit()) {
@@ -555,7 +552,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

-  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
     // f32 and f64 use SSE.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
@@ -593,7 +590,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

-  } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+  } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
              (UseX87 || Is64Bit)) {
     // Use SSE for f32, x87 for f64.
     // Set up the FP register classes.
@@ -2572,9 +2569,9 @@ EVT X86TargetLowering::getOptimalMemOpType(

 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   if (VT == MVT::f32)
-    return X86ScalarSSEf32;
+    return Subtarget.hasSSE1();
   if (VT == MVT::f64)
-    return X86ScalarSSEf64;
+    return Subtarget.hasSSE2();
   return true;
 }

@@ -5669,6 +5666,24 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   return Subtarget.hasLZCNT();
 }

+bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
+  return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
+         (VT == MVT::f16 && Subtarget.hasFP16());
+}
+
+bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+  // expensive than a straight movsd. On the other hand, it's important to
+  // shrink long double fp constant since fldt is very slow.
+  return !Subtarget.hasSSE2() || VT == MVT::f80;
+}
+
+bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
+  return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
+         (VT == MVT::f32 && Subtarget.hasSSE1()) ||
+         (VT == MVT::f16 && Subtarget.hasFP16());
+}
+
 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                                 const SelectionDAG &DAG,
                                                 const MachineMemOperand &MMO) const {
@@ -21196,9 +21211,10 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,

   // The transform for i64->f64 isn't correct for 0 when rounding to negative
   // infinity. It produces -0.0, so disable under strictfp.
-  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
+      !IsStrict)
     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
-  if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
+  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80)
     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
       (DstVT == MVT::f32 || DstVT == MVT::f64))
@@ -1039,10 +1039,7 @@ namespace llvm {

     bool isCtlzFast() const override;

-    bool hasBitPreservingFPLogic(EVT VT) const override {
-      return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
-             (VT == MVT::f16 && X86ScalarSSEf16);
-    }
+    bool hasBitPreservingFPLogic(EVT VT) const override;

     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
       // If the pair to store is a mixture of float and int values, we will
@@ -1322,12 +1319,7 @@ namespace llvm {
     /// If true, then instruction selection should
     /// seek to shrink the FP constant of the specified type to a smaller type
    /// in order to save space and / or reduce runtime.
-    bool ShouldShrinkFPConstant(EVT VT) const override {
-      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
-      // expensive than a straight movsd. On the other hand, it's important to
-      // shrink long double fp constant since fldt is very slow.
-      return !X86ScalarSSEf64 || VT == MVT::f80;
-    }
+    bool ShouldShrinkFPConstant(EVT VT) const override;

     /// Return true if we believe it is correct and profitable to reduce the
     /// load node to a smaller type.
@@ -1336,11 +1328,7 @@ namespace llvm {

     /// Return true if the specified scalar FP type is computed in an SSE
     /// register, not on the X87 floating point stack.
-    bool isScalarFPTypeInSSEReg(EVT VT) const {
-      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
-             (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
-             (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
-    }
+    bool isScalarFPTypeInSSEReg(EVT VT) const;

     /// Returns true if it is beneficial to convert a load of a constant
     /// to just the constant itself.
@@ -1494,13 +1482,6 @@ namespace llvm {
     /// make the right decision when generating code for different targets.
     const X86Subtarget &Subtarget;

-    /// Select between SSE or x87 floating point ops.
-    /// When SSE is available, use it for f32 operations.
-    /// When SSE2 is available, use it for f64 operations.
-    bool X86ScalarSSEf32;
-    bool X86ScalarSSEf64;
-    bool X86ScalarSSEf16;
-
     /// A list of legal FP immediates.
     std::vector<APFloat> LegalFPImmediates;
