forked from OSchip/llvm-project
make reciprocal estimate code generation more flexible by adding command-line options (3rd try)
The first try (r238051) to land this was reverted due to ExecutionEngine build failure; that was hopefully addressed by r238788. The second try (r238842) to land this was reverted due to BUILD_SHARED_LIBS failure; that was hopefully addressed by r238953. This patch adds a TargetRecip class for processing many recip codegen possibilities. The class is intended to handle both command-line options to llc as well as options passed in from a front-end such as clang with the -mrecip option. The x86 backend is updated to use the new functionality. Only -mcpu=btver2 with -ffast-math should see a functional change from this patch. All other x86 CPUs continue to *not* use reciprocal estimates by default with -ffast-math. Differential Revision: http://reviews.llvm.org/D8982 llvm-svn: 239001
This commit is contained in:
parent
215046bf98
commit
667a7e2a0f
|
@ -24,6 +24,7 @@
|
|||
#include "llvm/Support/Host.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Target/TargetRecip.h"
|
||||
#include <string>
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -152,6 +153,12 @@ FuseFPOps("fp-contract",
|
|||
"Only fuse FP ops when the result won't be effected."),
|
||||
clEnumValEnd));
|
||||
|
||||
// -recip=<list>: comma-separated reciprocal-estimate settings. The collected
// strings are handed to TargetRecip when the TargetOptions are built.
cl::list<std::string> ReciprocalOps(
    "recip", cl::CommaSeparated,
    cl::desc("Choose reciprocal operation types and parameters."),
    cl::value_desc("all,none,default,divf,!vec-sqrtd,vec-divd:0,sqrt:9..."));
|
||||
|
||||
cl::opt<bool>
|
||||
DontPlaceZerosInBSS("nozero-initialized-in-bss",
|
||||
cl::desc("Don't place zero-initialized symbols into bss section"),
|
||||
|
@ -230,6 +237,7 @@ static inline TargetOptions InitTargetOptionsFromCodeGenFlags() {
|
|||
TargetOptions Options;
|
||||
Options.LessPreciseFPMADOption = EnableFPMAD;
|
||||
Options.AllowFPOpFusion = FuseFPOps;
|
||||
Options.Reciprocals = TargetRecip(ReciprocalOps);
|
||||
Options.UnsafeFPMath = EnableUnsafeFPMath;
|
||||
Options.NoInfsFPMath = EnableNoInfsFPMath;
|
||||
Options.NoNaNsFPMath = EnableNoNaNsFPMath;
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#ifndef LLVM_TARGET_TARGETOPTIONS_H
|
||||
#define LLVM_TARGET_TARGETOPTIONS_H
|
||||
|
||||
#include "llvm/Target/TargetRecip.h"
|
||||
#include "llvm/MC/MCTargetOptions.h"
|
||||
#include <string>
|
||||
|
||||
|
@ -72,7 +73,8 @@ namespace llvm {
|
|||
CompressDebugSections(false), FunctionSections(false),
|
||||
DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
|
||||
TrapFuncName(), FloatABIType(FloatABI::Default),
|
||||
AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
|
||||
AllowFPOpFusion(FPOpFusion::Standard), Reciprocals(TargetRecip()),
|
||||
JTType(JumpTable::Single),
|
||||
ThreadModel(ThreadModel::POSIX) {}
|
||||
|
||||
/// PrintMachineCode - This flag is enabled when the -print-machineinstrs
|
||||
|
@ -206,6 +208,9 @@ namespace llvm {
|
|||
/// the value of this option.
|
||||
FPOpFusion::FPOpFusionMode AllowFPOpFusion;
|
||||
|
||||
/// This class encapsulates options for reciprocal-estimate code generation.
|
||||
TargetRecip Reciprocals;
|
||||
|
||||
/// JTType - This flag specifies the type of jump-instruction table to
|
||||
/// create for functions that have the jumptable attribute.
|
||||
JumpTable::JumpTableType JTType;
|
||||
|
@ -240,6 +245,7 @@ inline bool operator==(const TargetOptions &LHS,
|
|||
ARE_EQUAL(TrapFuncName) &&
|
||||
ARE_EQUAL(FloatABIType) &&
|
||||
ARE_EQUAL(AllowFPOpFusion) &&
|
||||
ARE_EQUAL(Reciprocals) &&
|
||||
ARE_EQUAL(JTType) &&
|
||||
ARE_EQUAL(ThreadModel) &&
|
||||
ARE_EQUAL(MCOptions);
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
//===--------------------- llvm/Target/TargetRecip.h ------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This class is used to customize machine-specific reciprocal estimate code
|
||||
// generation in a target-independent way.
|
||||
// If a target does not support operations in this specification, then code
|
||||
// generation will default to using supported operations.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_TARGET_TARGETRECIP_H
|
||||
#define LLVM_TARGET_TARGETRECIP_H
|
||||
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
namespace llvm {
|
||||
|
||||
struct TargetRecip {
|
||||
public:
|
||||
TargetRecip();
|
||||
|
||||
/// Initialize all or part of the operations from command-line options or
|
||||
/// a front end.
|
||||
TargetRecip(const std::vector<std::string> &Args);
|
||||
|
||||
/// Set whether a particular reciprocal operation is enabled and how many
|
||||
/// refinement steps are needed when using it. Use "all" to set enablement
|
||||
/// and refinement steps for all operations.
|
||||
void setDefaults(const StringRef &Key, bool Enable, unsigned RefSteps);
|
||||
|
||||
/// Return true if the reciprocal operation has been enabled by default or
|
||||
/// from the command-line. Return false if the operation has been disabled
|
||||
/// by default or from the command-line.
|
||||
bool isEnabled(const StringRef &Key) const;
|
||||
|
||||
/// Return the number of iterations necessary to refine the
|
||||
/// the result of a machine instruction for the given reciprocal operation.
|
||||
unsigned getRefinementSteps(const StringRef &Key) const;
|
||||
|
||||
bool operator==(const TargetRecip &Other) const;
|
||||
|
||||
private:
|
||||
enum {
|
||||
Uninitialized = -1
|
||||
};
|
||||
|
||||
struct RecipParams {
|
||||
int8_t Enabled;
|
||||
int8_t RefinementSteps;
|
||||
|
||||
RecipParams() : Enabled(Uninitialized), RefinementSteps(Uninitialized) {}
|
||||
};
|
||||
|
||||
std::map<StringRef, RecipParams> RecipMap;
|
||||
typedef std::map<StringRef, RecipParams>::iterator RecipIter;
|
||||
typedef std::map<StringRef, RecipParams>::const_iterator ConstRecipIter;
|
||||
|
||||
bool parseGlobalParams(const std::string &Arg);
|
||||
void parseIndividualParams(const std::vector<std::string> &Args);
|
||||
};
|
||||
|
||||
} // End llvm namespace
|
||||
|
||||
#endif
|
|
@ -6,6 +6,7 @@ add_llvm_library(LLVMTarget
|
|||
TargetLoweringObjectFile.cpp
|
||||
TargetMachine.cpp
|
||||
TargetMachineC.cpp
|
||||
TargetRecip.cpp
|
||||
TargetSubtargetInfo.cpp
|
||||
|
||||
ADDITIONAL_HEADER_DIRS
|
||||
|
|
|
@ -0,0 +1,225 @@
|
|||
//===-------------------------- TargetRecip.cpp ---------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This class is used to customize machine-specific reciprocal estimate code
|
||||
// generation in a target-independent way.
|
||||
// If a target does not support operations in this specification, then code
|
||||
// generation will default to using supported operations.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Target/TargetRecip.h"
|
||||
#include <map>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
// Names of the individual reciprocal operations. They serve as the key
// strings for queries and for command-line inputs. On top of these, the
// command-line interface also understands the global parameters "all",
// "none", and "default".
static const char *RecipOps[] = {
    "divd",  "divf",  "vec-divd",  "vec-divf",
    "sqrtd", "sqrtf", "vec-sqrtd", "vec-sqrtf",
};
|
||||
|
||||
// The uninitialized state is needed for the enabled settings and refinement
|
||||
// steps because custom settings may arrive via the command-line before target
|
||||
// defaults are set.
|
||||
TargetRecip::TargetRecip() {
|
||||
unsigned NumStrings = llvm::array_lengthof(RecipOps);
|
||||
for (unsigned i = 0; i < NumStrings; ++i)
|
||||
RecipMap.insert(std::make_pair(RecipOps[i], RecipParams()));
|
||||
}
|
||||
|
||||
static bool parseRefinementStep(const StringRef &In, size_t &Position,
|
||||
uint8_t &Value) {
|
||||
const char RefStepToken = ':';
|
||||
Position = In.find(RefStepToken);
|
||||
if (Position == StringRef::npos)
|
||||
return false;
|
||||
|
||||
StringRef RefStepString = In.substr(Position + 1);
|
||||
// Allow exactly one numeric character for the additional refinement
|
||||
// step parameter.
|
||||
if (RefStepString.size() == 1) {
|
||||
char RefStepChar = RefStepString[0];
|
||||
if (RefStepChar >= '0' && RefStepChar <= '9') {
|
||||
Value = RefStepChar - '0';
|
||||
return true;
|
||||
}
|
||||
}
|
||||
report_fatal_error("Invalid refinement step for -recip.");
|
||||
}
|
||||
|
||||
bool TargetRecip::parseGlobalParams(const std::string &Arg) {
|
||||
StringRef ArgSub = Arg;
|
||||
|
||||
// Look for an optional setting of the number of refinement steps needed
|
||||
// for this type of reciprocal operation.
|
||||
size_t RefPos;
|
||||
uint8_t RefSteps;
|
||||
StringRef RefStepString;
|
||||
if (parseRefinementStep(ArgSub, RefPos, RefSteps)) {
|
||||
// Split the string for further processing.
|
||||
RefStepString = ArgSub.substr(RefPos + 1);
|
||||
ArgSub = ArgSub.substr(0, RefPos);
|
||||
}
|
||||
bool Enable;
|
||||
bool UseDefaults;
|
||||
if (ArgSub == "all") {
|
||||
UseDefaults = false;
|
||||
Enable = true;
|
||||
} else if (ArgSub == "none") {
|
||||
UseDefaults = false;
|
||||
Enable = false;
|
||||
} else if (ArgSub == "default") {
|
||||
UseDefaults = true;
|
||||
} else {
|
||||
// Any other string is invalid or an individual setting.
|
||||
return false;
|
||||
}
|
||||
|
||||
// All enable values will be initialized to target defaults if 'default' was
|
||||
// specified.
|
||||
if (!UseDefaults)
|
||||
for (auto &KV : RecipMap)
|
||||
KV.second.Enabled = Enable;
|
||||
|
||||
// Custom refinement count was specified with all, none, or default.
|
||||
if (!RefStepString.empty())
|
||||
for (auto &KV : RecipMap)
|
||||
KV.second.RefinementSteps = RefSteps;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void TargetRecip::parseIndividualParams(const std::vector<std::string> &Args) {
|
||||
static const char DisabledPrefix = '!';
|
||||
unsigned NumArgs = Args.size();
|
||||
|
||||
for (unsigned i = 0; i != NumArgs; ++i) {
|
||||
StringRef Val = Args[i];
|
||||
|
||||
bool IsDisabled = Val[0] == DisabledPrefix;
|
||||
// Ignore the disablement token for string matching.
|
||||
if (IsDisabled)
|
||||
Val = Val.substr(1);
|
||||
|
||||
size_t RefPos;
|
||||
uint8_t RefSteps;
|
||||
StringRef RefStepString;
|
||||
if (parseRefinementStep(Val, RefPos, RefSteps)) {
|
||||
// Split the string for further processing.
|
||||
RefStepString = Val.substr(RefPos + 1);
|
||||
Val = Val.substr(0, RefPos);
|
||||
}
|
||||
|
||||
RecipIter Iter = RecipMap.find(Val);
|
||||
if (Iter == RecipMap.end()) {
|
||||
// Try again specifying float suffix.
|
||||
Iter = RecipMap.find(Val.str() + 'f');
|
||||
if (Iter == RecipMap.end()) {
|
||||
Iter = RecipMap.find(Val.str() + 'd');
|
||||
assert(Iter == RecipMap.end() && "Float entry missing from map");
|
||||
report_fatal_error("Invalid option for -recip.");
|
||||
}
|
||||
|
||||
// The option was specified without a float or double suffix.
|
||||
if (RecipMap[Val.str() + 'd'].Enabled != Uninitialized) {
|
||||
// Make sure that the double entry was not already specified.
|
||||
// The float entry will be checked below.
|
||||
report_fatal_error("Duplicate option for -recip.");
|
||||
}
|
||||
}
|
||||
|
||||
if (Iter->second.Enabled != Uninitialized)
|
||||
report_fatal_error("Duplicate option for -recip.");
|
||||
|
||||
// Mark the matched option as found. Do not allow duplicate specifiers.
|
||||
Iter->second.Enabled = !IsDisabled;
|
||||
if (!RefStepString.empty())
|
||||
Iter->second.RefinementSteps = RefSteps;
|
||||
|
||||
// If the precision was not specified, the double entry is also initialized.
|
||||
if (Val.back() != 'f' && Val.back() != 'd') {
|
||||
RecipMap[Val.str() + 'd'].Enabled = !IsDisabled;
|
||||
if (!RefStepString.empty())
|
||||
RecipMap[Val.str() + 'd'].RefinementSteps = RefSteps;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the settings from a list of option strings. A single argument is
/// first tried as one of the global settings ("all", "default" or "none");
/// everything else is treated as a list of per-operation settings.
TargetRecip::TargetRecip(const std::vector<std::string> &Args)
    : TargetRecip() {
  const bool HandledGlobally = Args.size() == 1 && parseGlobalParams(Args[0]);
  if (!HandledGlobally)
    parseIndividualParams(Args);
}
|
||||
|
||||
bool TargetRecip::isEnabled(const StringRef &Key) const {
|
||||
ConstRecipIter Iter = RecipMap.find(Key);
|
||||
assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
|
||||
assert(Iter->second.Enabled != Uninitialized &&
|
||||
"Enablement setting was not initialized");
|
||||
return Iter->second.Enabled;
|
||||
}
|
||||
|
||||
unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const {
|
||||
ConstRecipIter Iter = RecipMap.find(Key);
|
||||
assert(Iter != RecipMap.end() && "Unknown name for reciprocal map");
|
||||
assert(Iter->second.RefinementSteps != Uninitialized &&
|
||||
"Refinement step setting was not initialized");
|
||||
return Iter->second.RefinementSteps;
|
||||
}
|
||||
|
||||
/// Custom settings (previously initialized values) override target defaults.
|
||||
void TargetRecip::setDefaults(const StringRef &Key, bool Enable,
|
||||
unsigned RefSteps) {
|
||||
if (Key == "all") {
|
||||
for (auto &KV : RecipMap) {
|
||||
RecipParams &RP = KV.second;
|
||||
if (RP.Enabled == Uninitialized)
|
||||
RP.Enabled = Enable;
|
||||
if (RP.RefinementSteps == Uninitialized)
|
||||
RP.RefinementSteps = RefSteps;
|
||||
}
|
||||
} else {
|
||||
RecipParams &RP = RecipMap[Key];
|
||||
if (RP.Enabled == Uninitialized)
|
||||
RP.Enabled = Enable;
|
||||
if (RP.RefinementSteps == Uninitialized)
|
||||
RP.RefinementSteps = RefSteps;
|
||||
}
|
||||
}
|
||||
|
||||
/// Two TargetRecip objects are equal when they hold the same set of
/// operation names and every operation has the same enablement and
/// refinement-step values in both.
bool TargetRecip::operator==(const TargetRecip &Other) const {
  // Differing sizes mean one side has an operation the other lacks.
  if (RecipMap.size() != Other.RecipMap.size())
    return false;

  for (const auto &KV : RecipMap) {
    ConstRecipIter OtherIter = Other.RecipMap.find(KV.first);
    // Never dereference end(): a missing key simply means "not equal".
    if (OtherIter == Other.RecipMap.end())
      return false;

    const RecipParams &RP = KV.second;
    const RecipParams &OtherRP = OtherIter->second;
    if (RP.RefinementSteps != OtherRP.RefinementSteps)
      return false;
    if (RP.Enabled != OtherRP.Enabled)
      return false;
  }
  return true;
}
|
|
@ -190,10 +190,6 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
|
|||
"LEA instruction with certain arguments is slow">;
|
||||
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
|
||||
"INC and DEC instructions are slower than ADD and SUB">;
|
||||
def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
|
||||
"Use RSQRT* to optimize square root calculations">;
|
||||
def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
|
||||
"true", "Use RCP* to optimize division calculations">;
|
||||
def FeatureSoftFloat
|
||||
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
|
||||
"Use software floating point features.">;
|
||||
|
@ -446,7 +442,7 @@ def : ProcessorModel<"btver2", BtVer2Model,
|
|||
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
|
||||
FeatureBMI, FeatureF16C, FeatureMOVBE,
|
||||
FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem,
|
||||
FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>;
|
||||
FeatureSlowSHLD]>;
|
||||
|
||||
// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips.
|
||||
|
||||
|
|
|
@ -67,12 +67,6 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
|
|||
"rather than promotion."),
|
||||
cl::Hidden);
|
||||
|
||||
static cl::opt<int> ReciprocalEstimateRefinementSteps(
|
||||
"x86-recip-refinement-steps", cl::init(1),
|
||||
cl::desc("Specify the number of Newton-Raphson iterations applied to the "
|
||||
"result of the hardware reciprocal estimate instruction."),
|
||||
cl::NotHidden);
|
||||
|
||||
// Forward declarations.
|
||||
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
|
||||
SDValue V2);
|
||||
|
@ -13006,29 +13000,31 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
|
|||
DAGCombinerInfo &DCI,
|
||||
unsigned &RefinementSteps,
|
||||
bool &UseOneConstNR) const {
|
||||
// FIXME: We should use instruction latency models to calculate the cost of
|
||||
// each potential sequence, but this is very hard to do reliably because
|
||||
// at least Intel's Core* chips have variable timing based on the number of
|
||||
// significant digits in the divisor and/or sqrt operand.
|
||||
if (!Subtarget->useSqrtEst())
|
||||
return SDValue();
|
||||
|
||||
EVT VT = Op.getValueType();
|
||||
const char *RecipOp;
|
||||
|
||||
// SSE1 has rsqrtss and rsqrtps.
|
||||
// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
|
||||
// TODO: Add support for AVX512 (v16f32).
|
||||
// It is likely not profitable to do this for f64 because a double-precision
|
||||
// rsqrt estimate with refinement on x86 prior to FMA requires at least 16
|
||||
// instructions: convert to single, rsqrtss, convert back to double, refine
|
||||
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
|
||||
// along with FMA, this could be a throughput win.
|
||||
if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
|
||||
(Subtarget->hasAVX() && VT == MVT::v8f32)) {
|
||||
RefinementSteps = 1;
|
||||
UseOneConstNR = false;
|
||||
return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
|
||||
}
|
||||
return SDValue();
|
||||
if (VT == MVT::f32 && Subtarget->hasSSE1())
|
||||
RecipOp = "sqrtf";
|
||||
else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
|
||||
(VT == MVT::v8f32 && Subtarget->hasAVX()))
|
||||
RecipOp = "vec-sqrtf";
|
||||
else
|
||||
return SDValue();
|
||||
|
||||
TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
|
||||
if (!Recips.isEnabled(RecipOp))
|
||||
return SDValue();
|
||||
|
||||
RefinementSteps = Recips.getRefinementSteps(RecipOp);
|
||||
UseOneConstNR = false;
|
||||
return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
|
||||
}
|
||||
|
||||
/// The minimum architected relative accuracy is 2^-12. We need one
|
||||
|
@ -13036,15 +13032,9 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
|
|||
SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
|
||||
DAGCombinerInfo &DCI,
|
||||
unsigned &RefinementSteps) const {
|
||||
// FIXME: We should use instruction latency models to calculate the cost of
|
||||
// each potential sequence, but this is very hard to do reliably because
|
||||
// at least Intel's Core* chips have variable timing based on the number of
|
||||
// significant digits in the divisor.
|
||||
if (!Subtarget->useReciprocalEst())
|
||||
return SDValue();
|
||||
|
||||
EVT VT = Op.getValueType();
|
||||
|
||||
const char *RecipOp;
|
||||
|
||||
// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
|
||||
// TODO: Add support for AVX512 (v16f32).
|
||||
// It is likely not profitable to do this for f64 because a double-precision
|
||||
|
@ -13052,12 +13042,20 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
|
|||
// 15 instructions: convert to single, rcpss, convert back to double, refine
|
||||
// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
|
||||
// along with FMA, this could be a throughput win.
|
||||
if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
|
||||
(Subtarget->hasAVX() && VT == MVT::v8f32)) {
|
||||
RefinementSteps = ReciprocalEstimateRefinementSteps;
|
||||
return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
|
||||
}
|
||||
return SDValue();
|
||||
if (VT == MVT::f32 && Subtarget->hasSSE1())
|
||||
RecipOp = "divf";
|
||||
else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) ||
|
||||
(VT == MVT::v8f32 && Subtarget->hasAVX()))
|
||||
RecipOp = "vec-divf";
|
||||
else
|
||||
return SDValue();
|
||||
|
||||
TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
|
||||
if (!Recips.isEnabled(RecipOp))
|
||||
return SDValue();
|
||||
|
||||
RefinementSteps = Recips.getRefinementSteps(RecipOp);
|
||||
return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
|
||||
}
|
||||
|
||||
/// If we have at least two divisions that use the same divisor, convert to
|
||||
|
|
|
@ -274,8 +274,6 @@ void X86Subtarget::initializeEnvironment() {
|
|||
LEAUsesAG = false;
|
||||
SlowLEA = false;
|
||||
SlowIncDec = false;
|
||||
UseSqrtEst = false;
|
||||
UseReciprocalEst = false;
|
||||
stackAlignment = 4;
|
||||
// FIXME: this is a known good value for Yonah. How about others?
|
||||
MaxInlineSizeThreshold = 128;
|
||||
|
|
|
@ -190,16 +190,6 @@ protected:
|
|||
/// True if INC and DEC instructions are slow when writing to flags
|
||||
bool SlowIncDec;
|
||||
|
||||
/// Use the RSQRT* instructions to optimize square root calculations.
|
||||
/// For this to be profitable, the cost of FSQRT and FDIV must be
|
||||
/// substantially higher than normal FP ops like FADD and FMUL.
|
||||
bool UseSqrtEst;
|
||||
|
||||
/// Use the RCP* instructions to optimize FP division calculations.
|
||||
/// For this to be profitable, the cost of FDIV must be
|
||||
/// substantially higher than normal FP ops like FADD and FMUL.
|
||||
bool UseReciprocalEst;
|
||||
|
||||
/// Processor has AVX-512 PreFetch Instructions
|
||||
bool HasPFI;
|
||||
|
||||
|
@ -380,8 +370,6 @@ public:
|
|||
bool LEAusesAG() const { return LEAUsesAG; }
|
||||
bool slowLEA() const { return SlowLEA; }
|
||||
bool slowIncDec() const { return SlowIncDec; }
|
||||
bool useSqrtEst() const { return UseSqrtEst; }
|
||||
bool useReciprocalEst() const { return UseReciprocalEst; }
|
||||
bool hasCDI() const { return HasCDI; }
|
||||
bool hasPFI() const { return HasPFI; }
|
||||
bool hasERI() const { return HasERI; }
|
||||
|
|
|
@ -105,6 +105,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
|
|||
if (Subtarget.isTargetWin64())
|
||||
this->Options.TrapUnreachable = true;
|
||||
|
||||
// TODO: By default, all reciprocal estimate operations are off because
|
||||
// that matches the behavior before TargetRecip was added (except for btver2
|
||||
// which used subtarget features to enable this type of codegen).
|
||||
// We should change this to match GCC behavior where everything but
|
||||
// scalar division estimates are turned on by default with -ffast-math.
|
||||
this->Options.Reciprocals.setDefaults("all", false, 1);
|
||||
|
||||
initAsmInfo();
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE
|
||||
|
||||
; If the target's divss/divps instructions are substantially
|
||||
; slower than rcpss/rcpps with a Newton-Raphson refinement,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE
|
||||
|
||||
declare double @__sqrt_finite(double) #0
|
||||
declare float @__sqrtf_finite(float) #0
|
||||
|
|
Loading…
Reference in New Issue