forked from OSchip/llvm-project
[CUDA] Add -fcuda-approx-transcendentals flag.
Summary: This lets us emit e.g. sin.approx.f32. See http://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sin

Reviewers: rnk

Subscribers: tra, cfe-commits

Differential Revision: http://reviews.llvm.org/D20493

llvm-svn: 270484
This commit is contained in:
parent
66a891962b
commit
91f6f07bb8
|
@@ -190,6 +190,7 @@ LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
 LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
+LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 
 LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
 LANGOPT(ConceptsTS , 1, 0, "enable C++ Extensions for Concepts")
|
|
@@ -395,6 +395,9 @@ def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
 def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
   Flags<[CC1Option]>, HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
 def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
+def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
+  Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
+def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
 def dA : Flag<["-"], "dA">, Group<d_Group>;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
   HelpText<"Print macro definitions in -E mode in addition to normal output">;
|
@@ -4502,6 +4502,10 @@ CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                          options::OPT_fno_cuda_flush_denormals_to_zero, false))
     CC1Args.push_back("-fcuda-flush-denormals-to-zero");
 
+  if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
+                         options::OPT_fno_cuda_approx_transcendentals, false))
+    CC1Args.push_back("-fcuda-approx-transcendentals");
+
   if (DriverArgs.hasArg(options::OPT_nocudalib))
     return;
 
@@ -1616,6 +1616,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
   if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_flush_denormals_to_zero))
     Opts.CUDADeviceFlushDenormalsToZero = 1;
 
+  if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
+    Opts.CUDADeviceApproxTranscendentals = 1;
+
   if (Opts.ObjC1) {
     if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
       StringRef value = arg->getValue();
@@ -938,6 +938,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__CUDA_ARCH__");
   }
 
+  // We need to communicate this to our CUDA header wrapper, which in turn
+  // informs the proper CUDA headers of this choice.
+  if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
+    Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
+  }
+
   // OpenCL definitions.
   if (LangOpts.OpenCL) {
 #define OPENCLEXT(Ext) \
@@ -142,7 +142,20 @@
 #pragma push_macro("__forceinline__")
 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
 #include "device_functions.hpp"
+
+// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__
+#endif
 #include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
 #include "math_functions_dbl_ptx3.hpp"
 #pragma pop_macro("__forceinline__")
@@ -296,6 +309,7 @@ __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
 #include "curand_mtgp32_kernel.h"
 #pragma pop_macro("dim3")
 #pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
 
 #endif // __CUDA__
 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
@@ -0,0 +1,8 @@
+// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix HOST %s
+// RUN: %clang --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-NOFAST %s
+// RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+// RUN: %clang -ffast-math --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+
+// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
Loading…
Reference in New Issue