[X86] Refactor X86ScalarSSEf16/32/64 with hasFP16/SSE1/SSE2. NFCI

This is used for f16 emulation: we emulate f16 for SSE2 targets and above.
The refactoring makes the upcoming code cleaner.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D122475
Phoebe Wang 2022-03-27 12:23:21 +08:00
parent 1fd118ffc4
commit 674d52e8ce
3 changed files with 78 additions and 96 deletions
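The whole patch follows one pattern: drop the cached X86ScalarSSEf16/32/64 member flags and query the subtarget feature predicates (hasSSE1/hasSSE2/hasFP16) at each point of use. Below is a minimal standalone sketch of that pattern; the Subtarget struct, FPType enum, and main driver are simplified stand-ins for illustration, not the real LLVM classes.

#include <cstdio>

// Simplified stand-in for X86Subtarget; the real class exposes the same
// feature predicates used throughout this patch.
struct Subtarget {
  bool SSE1 = true, SSE2 = true, FP16 = false;
  bool hasSSE1() const { return SSE1; }
  bool hasSSE2() const { return SSE2; }
  bool hasFP16() const { return FP16; }
};

enum class FPType { f16, f32, f64 };

// After the refactor there are no cached X86ScalarSSEf* flags, only direct
// queries: f32 needs SSE1, f64 needs SSE2, f16 needs AVX512FP16.
bool isScalarFPTypeInSSEReg(const Subtarget &ST, FPType VT) {
  return (VT == FPType::f64 && ST.hasSSE2()) ||
         (VT == FPType::f32 && ST.hasSSE1()) ||
         (VT == FPType::f16 && ST.hasFP16());
}

int main() {
  Subtarget ST;
  std::printf("f64 in SSE reg: %d\n", isScalarFPTypeInSSEReg(ST, FPType::f64));
  std::printf("f16 in SSE reg: %d\n", isScalarFPTypeInSSEReg(ST, FPType::f16));
}

Querying the subtarget directly keeps a single source of truth for feature checks and avoids the cached flags drifting out of sync with the subtarget.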

llvm/lib/Target/X86/X86FastISel.cpp

@@ -49,22 +49,11 @@ class X86FastISel final : public FastISel {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
/// When SSE is available, use it for f32 operations.
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf64;
bool X86ScalarSSEf32;
bool X86ScalarSSEf16;
public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo)
: FastISel(funcInfo, libInfo) {
Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
X86ScalarSSEf16 = Subtarget->hasFP16();
}
bool fastSelectInstruction(const Instruction *I) override;
@@ -158,9 +147,9 @@ private:
/// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
/// computed in an SSE register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
(VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
(VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
return (VT == MVT::f64 && Subtarget->hasSSE2()) ||
(VT == MVT::f32 && Subtarget->hasSSE1()) ||
(VT == MVT::f16 && Subtarget->hasFP16());
}
bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
@@ -305,9 +294,9 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
VT = evt.getSimpleVT();
// For now, require SSE/SSE2 for performing floating-point operations,
// since x87 requires additional work.
if (VT == MVT::f64 && !X86ScalarSSEf64)
if (VT == MVT::f64 && !Subtarget->hasSSE2())
return false;
if (VT == MVT::f32 && !X86ScalarSSEf32)
if (VT == MVT::f32 && !Subtarget->hasSSE1())
return false;
// Similarly, no f80 support yet.
if (VT == MVT::f80)
@@ -325,6 +314,8 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
MachineMemOperand *MMO, unsigned &ResultReg,
unsigned Alignment) {
bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE41 = Subtarget->hasSSE41();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
@@ -354,20 +345,16 @@ bool X86FastISel::X86FastEmitLoad(MVT VT, X86AddressMode &AM,
Opc = X86::MOV64rm;
break;
case MVT::f32:
if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
HasAVX ? X86::VMOVSSrm_alt :
X86::MOVSSrm_alt;
else
Opc = X86::LD_Fp32m;
Opc = HasAVX512 ? X86::VMOVSSZrm_alt
: HasAVX ? X86::VMOVSSrm_alt
: HasSSE1 ? X86::MOVSSrm_alt
: X86::LD_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
HasAVX ? X86::VMOVSDrm_alt :
X86::MOVSDrm_alt;
else
Opc = X86::LD_Fp64m;
Opc = HasAVX512 ? X86::VMOVSDZrm_alt
: HasAVX ? X86::VMOVSDrm_alt
: HasSSE2 ? X86::MOVSDrm_alt
: X86::LD_Fp64m;
break;
case MVT::f80:
// No f80 support yet.
@@ -521,7 +508,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr;
break;
case MVT::f32:
if (X86ScalarSSEf32) {
if (HasSSE1) {
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSS;
else
@@ -531,7 +518,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, X86AddressMode &AM,
Opc = X86::ST_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf32) {
if (HasSSE2) {
if (IsNonTemporal && HasSSE4A)
Opc = X86::MOVNTSD;
else
@@ -1362,8 +1349,8 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
bool HasAVX512 = Subtarget->hasAVX512();
bool HasAVX = Subtarget->hasAVX();
bool X86ScalarSSEf32 = Subtarget->hasSSE1();
bool X86ScalarSSEf64 = Subtarget->hasSSE2();
bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
@@ -1372,15 +1359,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
case MVT::i32: return X86::CMP32rr;
case MVT::i64: return X86::CMP64rr;
case MVT::f32:
return X86ScalarSSEf32
? (HasAVX512 ? X86::VUCOMISSZrr
: HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
: 0;
return HasAVX512 ? X86::VUCOMISSZrr
: HasAVX ? X86::VUCOMISSrr
: HasSSE1 ? X86::UCOMISSrr
: 0;
case MVT::f64:
return X86ScalarSSEf64
? (HasAVX512 ? X86::VUCOMISDZrr
: HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
: 0;
return HasAVX512 ? X86::VUCOMISDZrr
: HasAVX ? X86::VUCOMISDrr
: HasSSE2 ? X86::UCOMISDrr
: 0;
}
}
@@ -2495,7 +2482,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
}
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
if (Subtarget->hasSSE2() && I->getType()->isDoubleTy() &&
I->getOperand(0)->getType()->isFloatTy()) {
bool HasAVX512 = Subtarget->hasAVX512();
// fpext from float to double.
@@ -2509,7 +2496,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
}
bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
if (Subtarget->hasSSE2() && I->getType()->isFloatTy() &&
I->getOperand(0)->getType()->isDoubleTy()) {
bool HasAVX512 = Subtarget->hasAVX512();
// fptrunc from double to float.
@@ -3733,25 +3720,23 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
bool HasAVX = Subtarget->hasAVX();
bool HasAVX512 = Subtarget->hasAVX512();
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::VMOVSSZrm_alt :
HasAVX ? X86::VMOVSSrm_alt :
X86::MOVSSrm_alt;
else
Opc = X86::LD_Fp32m;
Opc = HasAVX512 ? X86::VMOVSSZrm_alt
: HasAVX ? X86::VMOVSSrm_alt
: HasSSE1 ? X86::MOVSSrm_alt
: X86::LD_Fp32m;
break;
case MVT::f64:
if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::VMOVSDZrm_alt :
HasAVX ? X86::VMOVSDrm_alt :
X86::MOVSDrm_alt;
else
Opc = X86::LD_Fp64m;
Opc = HasAVX512 ? X86::VMOVSDZrm_alt
: HasAVX ? X86::VMOVSDrm_alt
: HasSSE2 ? X86::MOVSDrm_alt
: X86::LD_Fp64m;
break;
case MVT::f80:
// No f80 support yet.
@@ -3852,11 +3837,11 @@ unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
default:
break;
case MVT::f32:
if (!X86ScalarSSEf32)
if (!Subtarget->hasSSE1())
Opc = X86::LD_Fp032;
break;
case MVT::f64:
if (!X86ScalarSSEf64)
if (!Subtarget->hasSSE2())
Opc = X86::LD_Fp064;
break;
case MVT::f80:
@@ -3907,21 +3892,21 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
return 0;
// Get opcode and regclass for the given zero.
bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
bool HasAVX512 = Subtarget->hasAVX512();
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
else
Opc = X86::LD_Fp032;
Opc = HasAVX512 ? X86::AVX512_FsFLD0SS
: HasSSE1 ? X86::FsFLD0SS
: X86::LD_Fp032;
break;
case MVT::f64:
if (X86ScalarSSEf64)
Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
else
Opc = X86::LD_Fp064;
Opc = HasAVX512 ? X86::AVX512_FsFLD0SD
: HasSSE2 ? X86::FsFLD0SD
: X86::LD_Fp064;
break;
case MVT::f80:
// No f80 support yet.

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -108,9 +108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
X86ScalarSSEf64 = Subtarget.hasSSE2();
X86ScalarSSEf32 = Subtarget.hasSSE1();
X86ScalarSSEf16 = Subtarget.hasFP16();
MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
// Set up the TargetLowering object.
@@ -314,7 +311,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
if (!Subtarget.hasSSE2()) {
setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
if (Subtarget.is64Bit()) {
@@ -555,7 +552,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
// f32 and f64 use SSE.
// Set up the FP register classes.
addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
@@ -593,7 +590,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
} else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
} else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
(UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
@@ -2572,9 +2569,9 @@ EVT X86TargetLowering::getOptimalMemOpType(
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
if (VT == MVT::f32)
return X86ScalarSSEf32;
return Subtarget.hasSSE1();
if (VT == MVT::f64)
return X86ScalarSSEf64;
return Subtarget.hasSSE2();
return true;
}
@@ -5669,6 +5666,24 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
(VT == MVT::f16 && Subtarget.hasFP16());
}
bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
// Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
// expensive than a straight movsd. On the other hand, it's important to
// shrink long double fp constant since fldt is very slow.
return !Subtarget.hasSSE2() || VT == MVT::f80;
}
bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
(VT == MVT::f32 && Subtarget.hasSSE1()) ||
(VT == MVT::f16 && Subtarget.hasFP16());
}
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
const SelectionDAG &DAG,
const MachineMemOperand &MMO) const {
@@ -21196,9 +21211,10 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// The transform for i64->f64 isn't correct for 0 when rounding to negative
// infinity. It produces -0.0, so disable under strictfp.
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
!IsStrict)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
(DstVT == MVT::f32 || DstVT == MVT::f64))

llvm/lib/Target/X86/X86ISelLowering.h

@@ -1039,10 +1039,7 @@ namespace llvm {
bool isCtlzFast() const override;
bool hasBitPreservingFPLogic(EVT VT) const override {
return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() ||
(VT == MVT::f16 && X86ScalarSSEf16);
}
bool hasBitPreservingFPLogic(EVT VT) const override;
bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
// If the pair to store is a mixture of float and int values, we will
@@ -1322,12 +1319,7 @@ namespace llvm {
/// If true, then instruction selection should
/// seek to shrink the FP constant of the specified type to a smaller type
/// in order to save space and / or reduce runtime.
bool ShouldShrinkFPConstant(EVT VT) const override {
// Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
// expensive than a straight movsd. On the other hand, it's important to
// shrink long double fp constant since fldt is very slow.
return !X86ScalarSSEf64 || VT == MVT::f80;
}
bool ShouldShrinkFPConstant(EVT VT) const override;
/// Return true if we believe it is correct and profitable to reduce the
/// load node to a smaller type.
@@ -1336,11 +1328,7 @@ namespace llvm {
/// Return true if the specified scalar FP type is computed in an SSE
/// register, not on the X87 floating point stack.
bool isScalarFPTypeInSSEReg(EVT VT) const {
return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
(VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1
(VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16
}
bool isScalarFPTypeInSSEReg(EVT VT) const;
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
@@ -1494,13 +1482,6 @@ namespace llvm {
/// make the right decision when generating code for different targets.
const X86Subtarget &Subtarget;
/// Select between SSE or x87 floating point ops.
/// When SSE is available, use it for f32 operations.
/// When SSE2 is available, use it for f64 operations.
bool X86ScalarSSEf32;
bool X86ScalarSSEf64;
bool X86ScalarSSEf16;
/// A list of legal FP immediates.
std::vector<APFloat> LegalFPImmediates;