forked from OSchip/llvm-project
HIP: Fix handling of denormal mode
I didn't realize HIP was a distinct offloading kind, so the subtarget lookup only consulted -mcpu, which isn't how the GPU architecture is specified for HIP. Multiple offload targets can also have different denormal defaults, so we need to thread the JobAction through the target hook to query the offloading arch of each job.
This commit is contained in:
parent
0d4ec16d3d
commit
dc89a3efb4
|
@ -636,8 +636,7 @@ public:
|
|||
/// environment for the given \p FPType if given. Otherwise, the default
|
||||
/// assumed mode for any floating point type.
|
||||
virtual llvm::DenormalMode getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs,
|
||||
Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType = nullptr) const {
|
||||
return llvm::DenormalMode::getIEEE();
|
||||
}
|
||||
|
|
|
@ -273,18 +273,22 @@ bool AMDGPUToolChain::getDefaultDenormsAreZeroForTarget(
|
|||
}
|
||||
|
||||
llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType) const {
|
||||
// Denormals should always be enabled for f16 and f64.
|
||||
if (!FPType || FPType != &llvm::APFloat::IEEEsingle())
|
||||
return llvm::DenormalMode::getIEEE();
|
||||
|
||||
if (DeviceOffloadKind == Action::OFK_Cuda) {
|
||||
if (JA.getOffloadingDeviceKind() == Action::OFK_HIP ||
|
||||
JA.getOffloadingDeviceKind() == Action::OFK_Cuda) {
|
||||
auto Kind = llvm::AMDGPU::parseArchAMDGCN(JA.getOffloadingArch());
|
||||
if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
|
||||
DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
|
||||
options::OPT_fno_cuda_flush_denormals_to_zero,
|
||||
false))
|
||||
getDefaultDenormsAreZeroForTarget(Kind)))
|
||||
return llvm::DenormalMode::getPreserveSign();
|
||||
|
||||
return llvm::DenormalMode::getIEEE();
|
||||
}
|
||||
|
||||
const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ);
|
||||
|
@ -294,7 +298,9 @@ llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
|
|||
// them all?
|
||||
bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
|
||||
getDefaultDenormsAreZeroForTarget(Kind);
|
||||
// Outputs are flushed to zero, preserving sign
|
||||
|
||||
// Outputs are flushed to zero (FTZ), preserving sign. Denormal inputs are
|
||||
// also implicitly treated as zero (DAZ).
|
||||
return DAZ ? llvm::DenormalMode::getPreserveSign() :
|
||||
llvm::DenormalMode::getIEEE();
|
||||
}
|
||||
|
|
|
@ -214,8 +214,7 @@ public:
|
|||
static bool getDefaultDenormsAreZeroForTarget(llvm::AMDGPU::GPUKind GPUKind);
|
||||
|
||||
llvm::DenormalMode getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs,
|
||||
Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType = nullptr) const override;
|
||||
};
|
||||
|
||||
|
|
|
@ -2510,7 +2510,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
|
|||
static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
|
||||
bool OFastEnabled, const ArgList &Args,
|
||||
ArgStringList &CmdArgs,
|
||||
Action::OffloadKind DeviceOffloadKind) {
|
||||
const JobAction &JA) {
|
||||
// Handle various floating point optimization flags, mapping them to the
|
||||
// appropriate LLVM code generation flags. This is complicated by several
|
||||
// "umbrella" flags, so we do this by stepping through the flags incrementally
|
||||
|
@ -2533,10 +2533,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
|
|||
// -ffp-exception-behavior options: strict, maytrap, ignore
|
||||
StringRef FPExceptionBehavior = "";
|
||||
const llvm::DenormalMode DefaultDenormalFPMath =
|
||||
TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind);
|
||||
TC.getDefaultDenormalModeForType(Args, JA);
|
||||
const llvm::DenormalMode DefaultDenormalFP32Math =
|
||||
TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind,
|
||||
&llvm::APFloat::IEEEsingle());
|
||||
TC.getDefaultDenormalModeForType(Args, JA, &llvm::APFloat::IEEEsingle());
|
||||
|
||||
llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
|
||||
llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math;
|
||||
|
@ -4295,7 +4294,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
CmdArgs.push_back("-mdisable-tail-calls");
|
||||
|
||||
RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args,
|
||||
CmdArgs, JA.getOffloadingDeviceKind());
|
||||
CmdArgs, JA);
|
||||
|
||||
// Render ABI arguments
|
||||
switch (TC.getArch()) {
|
||||
|
@ -4618,8 +4617,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
if (Args.hasArg(options::OPT_fsplit_stack))
|
||||
CmdArgs.push_back("-split-stacks");
|
||||
|
||||
RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs,
|
||||
JA.getOffloadingDeviceKind());
|
||||
RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, JA);
|
||||
|
||||
if (Arg *A = Args.getLastArg(options::OPT_mdouble_EQ)) {
|
||||
if (TC.getArch() == llvm::Triple::avr)
|
||||
|
|
|
@ -721,9 +721,9 @@ void CudaToolChain::addClangTargetOptions(
|
|||
}
|
||||
|
||||
llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType) const {
|
||||
if (DeviceOffloadKind == Action::OFK_Cuda) {
|
||||
if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) {
|
||||
if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
|
||||
DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
|
||||
options::OPT_fno_cuda_flush_denormals_to_zero,
|
||||
|
@ -731,7 +731,7 @@ llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
|
|||
return llvm::DenormalMode::getPreserveSign();
|
||||
}
|
||||
|
||||
assert(DeviceOffloadKind != Action::OFK_Host);
|
||||
assert(JA.getOffloadingDeviceKind() != Action::OFK_Host);
|
||||
return llvm::DenormalMode::getIEEE();
|
||||
}
|
||||
|
||||
|
|
|
@ -156,8 +156,7 @@ public:
|
|||
Action::OffloadKind DeviceOffloadKind) const override;
|
||||
|
||||
llvm::DenormalMode getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs,
|
||||
Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType = nullptr) const override;
|
||||
|
||||
// Never try to use the integrated assembler with CUDA; always fork out to
|
||||
|
|
|
@ -988,10 +988,10 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
|
|||
ToolChain::addProfileRTLibs(Args, CmdArgs);
|
||||
}
|
||||
|
||||
llvm::DenormalMode Linux::getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs,
|
||||
Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::fltSemantics *FPType) const {
|
||||
llvm::DenormalMode
|
||||
Linux::getDefaultDenormalModeForType(const llvm::opt::ArgList &DriverArgs,
|
||||
const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType) const {
|
||||
switch (getTriple().getArch()) {
|
||||
case llvm::Triple::x86:
|
||||
case llvm::Triple::x86_64: {
|
||||
|
|
|
@ -49,9 +49,8 @@ public:
|
|||
std::vector<std::string> ExtraOpts;
|
||||
|
||||
llvm::DenormalMode getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs,
|
||||
Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::fltSemantics *FPType = nullptr) const override;
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType = nullptr) const override;
|
||||
|
||||
protected:
|
||||
Tool *buildAssembler() const override;
|
||||
|
|
|
@ -94,9 +94,8 @@ public:
|
|||
Action::OffloadKind DeviceOffloadingKind) const override;
|
||||
|
||||
llvm::DenormalMode getDefaultDenormalModeForType(
|
||||
const llvm::opt::ArgList &DriverArgs,
|
||||
Action::OffloadKind DeviceOffloadKind,
|
||||
const llvm::fltSemantics *FPType) const override {
|
||||
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
|
||||
const llvm::fltSemantics *FPType) const override {
|
||||
// DAZ and FTZ are on by default.
|
||||
return llvm::DenormalMode::getPreserveSign();
|
||||
}
|
||||
|
|
|
@ -7,16 +7,28 @@
|
|||
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
|
||||
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
|
||||
|
||||
// Test explicit argument.
|
||||
// Test explicit argument, with CUDA offload kind
|
||||
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
|
||||
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
|
||||
|
||||
// Test explicit argument, with HIP offload kind
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
|
||||
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
|
||||
|
||||
// Test the default changing with no argument based on the subtarget.
|
||||
// Test the default changing with no argument based on the subtarget in HIP mode
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
|
||||
|
||||
|
||||
// Test multiple offload archs with different defaults.
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=MIXED-DEFAULT-MODE %s
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell -fcuda-flush-denormals-to-zero --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZX2 %s
|
||||
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell -fno-cuda-flush-denormals-to-zero --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
|
||||
|
||||
|
||||
// CPUFTZ-NOT: -fdenormal-fp-math
|
||||
|
||||
// FTZ-NOT: -fdenormal-fp-math-f32=
|
||||
|
@ -25,3 +37,13 @@
|
|||
// The default of ieee is omitted
|
||||
// NOFTZ-NOT: "-fdenormal-fp-math"
|
||||
// NOFTZ-NOT: "-fdenormal-fp-math-f32"
|
||||
|
||||
// MIXED-DEFAULT-MODE-NOT: -denormal-fp-math
|
||||
// MIXED-DEFAULT-MODE: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"
|
||||
// MIXED-DEFAULT-MODE-SAME: "-target-cpu" "gfx803"
|
||||
// MIXED-DEFAULT-MODE-NOT: -denormal-fp-math
|
||||
|
||||
// FTZX2: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"
|
||||
// FTZX2-SAME: "-target-cpu" "gfx803"
|
||||
// FTZX2: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"
|
||||
// FTZX2-SAME: "-target-cpu" "gfx900"
|
||||
|
|
Loading…
Reference in New Issue