[OpenMP][Clang] Support for target math functions

Summary:
In this patch we propose a temporary solution to resolving math functions for the NVPTX toolchain, temporary until OpenMP variant is supported by Clang.

We intercept the inclusion of math.h and cmath headers and if we are in the OpenMP-NVPTX case, we re-use CUDA's math function resolution mechanism.

Authors:
@gtbercea
@jdoerfert

Reviewers: hfinkel, caomhin, ABataev, tra

Reviewed By: hfinkel, ABataev, tra

Subscribers: JDevlieghere, mgorny, guansong, cfe-commits, jdoerfert

Tags: #clang

Differential Revision: https://reviews.llvm.org/D61399

llvm-svn: 360265
This commit is contained in:
Gheorghe-Teodor Bercea 2019-05-08 15:52:33 +00:00
parent a3ff5727b7
commit e62c693c8e
18 changed files with 693 additions and 433 deletions

View File

@ -425,7 +425,7 @@ bool ToolChain::needsProfileRT(const ArgList &Args) {
Args.hasArg(options::OPT_fprofile_instr_generate) ||
Args.hasArg(options::OPT_fprofile_instr_generate_EQ) ||
Args.hasArg(options::OPT_fcreate_profile) ||
Args.hasArg(options::OPT_forder_file_instrumentation))
Args.hasArg(options::OPT_forder_file_instrumentation))
return true;
return false;

View File

@ -1151,6 +1151,24 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
if (JA.isOffloading(Action::OFK_Cuda))
getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
// If we are offloading to a target via OpenMP we need to include the
// openmp_wrappers folder which contains alternative system headers.
if (JA.isDeviceOffloading(Action::OFK_OpenMP) &&
getToolChain().getTriple().isNVPTX()){
if (!Args.hasArg(options::OPT_nobuiltininc)) {
// Add openmp_wrappers/* to our system include path. This lets us wrap
// standard library headers.
SmallString<128> P(D.ResourceDir);
llvm::sys::path::append(P, "include");
llvm::sys::path::append(P, "openmp_wrappers");
CmdArgs.push_back("-internal-isystem");
CmdArgs.push_back(Args.MakeArgString(P));
}
CmdArgs.push_back("-include");
CmdArgs.push_back("__clang_openmp_math.h");
}
// Add -i* options, and automatically translate to
// -include-pch/-include-pth for transparent PCH support. It's
// wonky, but we include looking for .gch so we can support seamless

View File

@ -128,6 +128,12 @@ set(ppc_wrapper_files
ppc_wrappers/mmintrin.h
)
set(openmp_wrapper_files
openmp_wrappers/math.h
openmp_wrappers/cmath
openmp_wrappers/__clang_openmp_math.h
)
set(output_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include)
set(out_files)
set(generated_files)
@ -156,7 +162,7 @@ endfunction(clang_generate_header)
# Copy header files from the source directory to the build directory
foreach( f ${files} ${cuda_wrapper_files} ${ppc_wrapper_files} )
foreach( f ${files} ${cuda_wrapper_files} ${ppc_wrapper_files} ${openmp_wrapper_files})
copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
endforeach( f )
@ -188,6 +194,11 @@ install(
DESTINATION ${header_install_dir}/ppc_wrappers
COMPONENT clang-resource-headers)
install(
FILES ${openmp_wrapper_files}
DESTINATION ${header_install_dir}/openmp_wrappers
COMPONENT clang-resource-headers)
if (NOT LLVM_ENABLE_IDE)
add_llvm_install_targets(install-clang-resource-headers
DEPENDS clang-resource-headers

View File

@ -30,7 +30,11 @@
// implementation. Declaring in the global namespace and pulling into namespace
// std covers all of the known knowns.
#ifdef _OPENMP
#define __DEVICE__ static __attribute__((always_inline))
#else
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
#endif
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ long abs(long __n) { return ::labs(__n); }
@ -47,6 +51,8 @@ __DEVICE__ float exp(float __x) { return ::expf(__x); }
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
// TODO: remove when variant is supported
#ifndef _OPENMP
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
@ -55,6 +61,7 @@ __DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
#endif
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
@ -434,7 +441,10 @@ using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
// TODO: remove once variant is supported
#ifndef _OPENMP
using ::scalblnf;
#endif
using ::scalbnf;
using ::sinf;
using ::sinhf;

View File

@ -10,15 +10,21 @@
#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#ifndef _OPENMP
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif
// __DEVICE__ is a helper macro with common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions and __forceinline__ helps inlining these wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef _OPENMP
#define __DEVICE__ static __attribute__((always_inline))
#else
#define __DEVICE__ static __device__ __forceinline__
#endif
// libdevice provides fast low precision and slow full-recision implementations
// for some functions. Which one gets selected depends on
@ -38,8 +44,13 @@ __DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); }
__DEVICE__ unsigned long long __brevll(unsigned long long __a) {
return __nv_brevll(__a);
}
#if defined(__cplusplus)
__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
__DEVICE__ void __brkpt(int __a) { __brkpt(); }
#else
__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); }
__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
#endif
__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
unsigned int __c) {
return __nv_byte_perm(__a, __b, __c);
@ -1559,7 +1570,7 @@ __DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long labs(long __a) { return llabs(__a); };
__DEVICE__ long labs(long __a) { return __nv_llabs(__a); };
#else
__DEVICE__ long labs(long __a) { return __nv_abs(__a); };
#endif
@ -1604,12 +1615,16 @@ __DEVICE__ long lround(double __a) { return round(__a); }
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
#endif
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
// These functions shouldn't be declared when including this header
// for math function resolution purposes.
#ifndef _OPENMP
__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) {
return __builtin_memcpy(__a, __b, __c);
}
__DEVICE__ void *memset(void *__a, int __b, size_t __c) {
return __builtin_memset(__a, __b, __c);
}
#endif
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
@ -1693,6 +1708,8 @@ __DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
// TODO: remove once variant is supported
#ifndef _OPENMP
__DEVICE__ double scalbln(double __a, long __b) {
if (__b > INT_MAX)
return __a > 0 ? HUGE_VAL : -HUGE_VAL;
@ -1707,6 +1724,7 @@ __DEVICE__ float scalblnf(float __a, long __b) {
return __a > 0 ? 0.f : -0.f;
return scalbnf(__a, (int)__b);
}
#endif
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
__DEVICE__ void sincos(double __a, double *__s, double *__c) {
return __nv_sincos(__a, __s, __c);

View File

@ -10,443 +10,453 @@
#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#if defined(__cplusplus)
extern "C" {
#endif
__device__ int __nv_abs(int __a);
__device__ double __nv_acos(double __a);
__device__ float __nv_acosf(float __a);
__device__ double __nv_acosh(double __a);
__device__ float __nv_acoshf(float __a);
__device__ double __nv_asin(double __a);
__device__ float __nv_asinf(float __a);
__device__ double __nv_asinh(double __a);
__device__ float __nv_asinhf(float __a);
__device__ double __nv_atan2(double __a, double __b);
__device__ float __nv_atan2f(float __a, float __b);
__device__ double __nv_atan(double __a);
__device__ float __nv_atanf(float __a);
__device__ double __nv_atanh(double __a);
__device__ float __nv_atanhf(float __a);
__device__ int __nv_brev(int __a);
__device__ long long __nv_brevll(long long __a);
__device__ int __nv_byte_perm(int __a, int __b, int __c);
__device__ double __nv_cbrt(double __a);
__device__ float __nv_cbrtf(float __a);
__device__ double __nv_ceil(double __a);
__device__ float __nv_ceilf(float __a);
__device__ int __nv_clz(int __a);
__device__ int __nv_clzll(long long __a);
__device__ double __nv_copysign(double __a, double __b);
__device__ float __nv_copysignf(float __a, float __b);
__device__ double __nv_cos(double __a);
__device__ float __nv_cosf(float __a);
__device__ double __nv_cosh(double __a);
__device__ float __nv_coshf(float __a);
__device__ double __nv_cospi(double __a);
__device__ float __nv_cospif(float __a);
__device__ double __nv_cyl_bessel_i0(double __a);
__device__ float __nv_cyl_bessel_i0f(float __a);
__device__ double __nv_cyl_bessel_i1(double __a);
__device__ float __nv_cyl_bessel_i1f(float __a);
__device__ double __nv_dadd_rd(double __a, double __b);
__device__ double __nv_dadd_rn(double __a, double __b);
__device__ double __nv_dadd_ru(double __a, double __b);
__device__ double __nv_dadd_rz(double __a, double __b);
__device__ double __nv_ddiv_rd(double __a, double __b);
__device__ double __nv_ddiv_rn(double __a, double __b);
__device__ double __nv_ddiv_ru(double __a, double __b);
__device__ double __nv_ddiv_rz(double __a, double __b);
__device__ double __nv_dmul_rd(double __a, double __b);
__device__ double __nv_dmul_rn(double __a, double __b);
__device__ double __nv_dmul_ru(double __a, double __b);
__device__ double __nv_dmul_rz(double __a, double __b);
__device__ float __nv_double2float_rd(double __a);
__device__ float __nv_double2float_rn(double __a);
__device__ float __nv_double2float_ru(double __a);
__device__ float __nv_double2float_rz(double __a);
__device__ int __nv_double2hiint(double __a);
__device__ int __nv_double2int_rd(double __a);
__device__ int __nv_double2int_rn(double __a);
__device__ int __nv_double2int_ru(double __a);
__device__ int __nv_double2int_rz(double __a);
__device__ long long __nv_double2ll_rd(double __a);
__device__ long long __nv_double2ll_rn(double __a);
__device__ long long __nv_double2ll_ru(double __a);
__device__ long long __nv_double2ll_rz(double __a);
__device__ int __nv_double2loint(double __a);
__device__ unsigned int __nv_double2uint_rd(double __a);
__device__ unsigned int __nv_double2uint_rn(double __a);
__device__ unsigned int __nv_double2uint_ru(double __a);
__device__ unsigned int __nv_double2uint_rz(double __a);
__device__ unsigned long long __nv_double2ull_rd(double __a);
__device__ unsigned long long __nv_double2ull_rn(double __a);
__device__ unsigned long long __nv_double2ull_ru(double __a);
__device__ unsigned long long __nv_double2ull_rz(double __a);
__device__ unsigned long long __nv_double_as_longlong(double __a);
__device__ double __nv_drcp_rd(double __a);
__device__ double __nv_drcp_rn(double __a);
__device__ double __nv_drcp_ru(double __a);
__device__ double __nv_drcp_rz(double __a);
__device__ double __nv_dsqrt_rd(double __a);
__device__ double __nv_dsqrt_rn(double __a);
__device__ double __nv_dsqrt_ru(double __a);
__device__ double __nv_dsqrt_rz(double __a);
__device__ double __nv_dsub_rd(double __a, double __b);
__device__ double __nv_dsub_rn(double __a, double __b);
__device__ double __nv_dsub_ru(double __a, double __b);
__device__ double __nv_dsub_rz(double __a, double __b);
__device__ double __nv_erfc(double __a);
__device__ float __nv_erfcf(float __a);
__device__ double __nv_erfcinv(double __a);
__device__ float __nv_erfcinvf(float __a);
__device__ double __nv_erfcx(double __a);
__device__ float __nv_erfcxf(float __a);
__device__ double __nv_erf(double __a);
__device__ float __nv_erff(float __a);
__device__ double __nv_erfinv(double __a);
__device__ float __nv_erfinvf(float __a);
__device__ double __nv_exp10(double __a);
__device__ float __nv_exp10f(float __a);
__device__ double __nv_exp2(double __a);
__device__ float __nv_exp2f(float __a);
__device__ double __nv_exp(double __a);
__device__ float __nv_expf(float __a);
__device__ double __nv_expm1(double __a);
__device__ float __nv_expm1f(float __a);
__device__ double __nv_fabs(double __a);
__device__ float __nv_fabsf(float __a);
__device__ float __nv_fadd_rd(float __a, float __b);
__device__ float __nv_fadd_rn(float __a, float __b);
__device__ float __nv_fadd_ru(float __a, float __b);
__device__ float __nv_fadd_rz(float __a, float __b);
__device__ float __nv_fast_cosf(float __a);
__device__ float __nv_fast_exp10f(float __a);
__device__ float __nv_fast_expf(float __a);
__device__ float __nv_fast_fdividef(float __a, float __b);
__device__ float __nv_fast_log10f(float __a);
__device__ float __nv_fast_log2f(float __a);
__device__ float __nv_fast_logf(float __a);
__device__ float __nv_fast_powf(float __a, float __b);
__device__ void __nv_fast_sincosf(float __a, float *__s, float *__c);
__device__ float __nv_fast_sinf(float __a);
__device__ float __nv_fast_tanf(float __a);
__device__ double __nv_fdim(double __a, double __b);
__device__ float __nv_fdimf(float __a, float __b);
__device__ float __nv_fdiv_rd(float __a, float __b);
__device__ float __nv_fdiv_rn(float __a, float __b);
__device__ float __nv_fdiv_ru(float __a, float __b);
__device__ float __nv_fdiv_rz(float __a, float __b);
__device__ int __nv_ffs(int __a);
__device__ int __nv_ffsll(long long __a);
__device__ int __nv_finitef(float __a);
__device__ unsigned short __nv_float2half_rn(float __a);
__device__ int __nv_float2int_rd(float __a);
__device__ int __nv_float2int_rn(float __a);
__device__ int __nv_float2int_ru(float __a);
__device__ int __nv_float2int_rz(float __a);
__device__ long long __nv_float2ll_rd(float __a);
__device__ long long __nv_float2ll_rn(float __a);
__device__ long long __nv_float2ll_ru(float __a);
__device__ long long __nv_float2ll_rz(float __a);
__device__ unsigned int __nv_float2uint_rd(float __a);
__device__ unsigned int __nv_float2uint_rn(float __a);
__device__ unsigned int __nv_float2uint_ru(float __a);
__device__ unsigned int __nv_float2uint_rz(float __a);
__device__ unsigned long long __nv_float2ull_rd(float __a);
__device__ unsigned long long __nv_float2ull_rn(float __a);
__device__ unsigned long long __nv_float2ull_ru(float __a);
__device__ unsigned long long __nv_float2ull_rz(float __a);
__device__ int __nv_float_as_int(float __a);
__device__ unsigned int __nv_float_as_uint(float __a);
__device__ double __nv_floor(double __a);
__device__ float __nv_floorf(float __a);
__device__ double __nv_fma(double __a, double __b, double __c);
__device__ float __nv_fmaf(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__device__ float __nv_fmaf_rd(float __a, float __b, float __c);
__device__ float __nv_fmaf_rn(float __a, float __b, float __c);
__device__ float __nv_fmaf_ru(float __a, float __b, float __c);
__device__ float __nv_fmaf_rz(float __a, float __b, float __c);
__device__ double __nv_fma_rd(double __a, double __b, double __c);
__device__ double __nv_fma_rn(double __a, double __b, double __c);
__device__ double __nv_fma_ru(double __a, double __b, double __c);
__device__ double __nv_fma_rz(double __a, double __b, double __c);
__device__ double __nv_fmax(double __a, double __b);
__device__ float __nv_fmaxf(float __a, float __b);
__device__ double __nv_fmin(double __a, double __b);
__device__ float __nv_fminf(float __a, float __b);
__device__ double __nv_fmod(double __a, double __b);
__device__ float __nv_fmodf(float __a, float __b);
__device__ float __nv_fmul_rd(float __a, float __b);
__device__ float __nv_fmul_rn(float __a, float __b);
__device__ float __nv_fmul_ru(float __a, float __b);
__device__ float __nv_fmul_rz(float __a, float __b);
__device__ float __nv_frcp_rd(float __a);
__device__ float __nv_frcp_rn(float __a);
__device__ float __nv_frcp_ru(float __a);
__device__ float __nv_frcp_rz(float __a);
__device__ double __nv_frexp(double __a, int *__b);
__device__ float __nv_frexpf(float __a, int *__b);
__device__ float __nv_frsqrt_rn(float __a);
__device__ float __nv_fsqrt_rd(float __a);
__device__ float __nv_fsqrt_rn(float __a);
__device__ float __nv_fsqrt_ru(float __a);
__device__ float __nv_fsqrt_rz(float __a);
__device__ float __nv_fsub_rd(float __a, float __b);
__device__ float __nv_fsub_rn(float __a, float __b);
__device__ float __nv_fsub_ru(float __a, float __b);
__device__ float __nv_fsub_rz(float __a, float __b);
__device__ int __nv_hadd(int __a, int __b);
__device__ float __nv_half2float(unsigned short __h);
__device__ double __nv_hiloint2double(int __a, int __b);
__device__ double __nv_hypot(double __a, double __b);
__device__ float __nv_hypotf(float __a, float __b);
__device__ int __nv_ilogb(double __a);
__device__ int __nv_ilogbf(float __a);
__device__ double __nv_int2double_rn(int __a);
__device__ float __nv_int2float_rd(int __a);
__device__ float __nv_int2float_rn(int __a);
__device__ float __nv_int2float_ru(int __a);
__device__ float __nv_int2float_rz(int __a);
__device__ float __nv_int_as_float(int __a);
__device__ int __nv_isfinited(double __a);
__device__ int __nv_isinfd(double __a);
__device__ int __nv_isinff(float __a);
__device__ int __nv_isnand(double __a);
__device__ int __nv_isnanf(float __a);
__device__ double __nv_j0(double __a);
__device__ float __nv_j0f(float __a);
__device__ double __nv_j1(double __a);
__device__ float __nv_j1f(float __a);
__device__ float __nv_jnf(int __a, float __b);
__device__ double __nv_jn(int __a, double __b);
__device__ double __nv_ldexp(double __a, int __b);
__device__ float __nv_ldexpf(float __a, int __b);
__device__ double __nv_lgamma(double __a);
__device__ float __nv_lgammaf(float __a);
__device__ double __nv_ll2double_rd(long long __a);
__device__ double __nv_ll2double_rn(long long __a);
__device__ double __nv_ll2double_ru(long long __a);
__device__ double __nv_ll2double_rz(long long __a);
__device__ float __nv_ll2float_rd(long long __a);
__device__ float __nv_ll2float_rn(long long __a);
__device__ float __nv_ll2float_ru(long long __a);
__device__ float __nv_ll2float_rz(long long __a);
__device__ long long __nv_llabs(long long __a);
__device__ long long __nv_llmax(long long __a, long long __b);
__device__ long long __nv_llmin(long long __a, long long __b);
__device__ long long __nv_llrint(double __a);
__device__ long long __nv_llrintf(float __a);
__device__ long long __nv_llround(double __a);
__device__ long long __nv_llroundf(float __a);
__device__ double __nv_log10(double __a);
__device__ float __nv_log10f(float __a);
__device__ double __nv_log1p(double __a);
__device__ float __nv_log1pf(float __a);
__device__ double __nv_log2(double __a);
__device__ float __nv_log2f(float __a);
__device__ double __nv_logb(double __a);
__device__ float __nv_logbf(float __a);
__device__ double __nv_log(double __a);
__device__ float __nv_logf(float __a);
__device__ double __nv_longlong_as_double(long long __a);
__device__ int __nv_max(int __a, int __b);
__device__ int __nv_min(int __a, int __b);
__device__ double __nv_modf(double __a, double *__b);
__device__ float __nv_modff(float __a, float *__b);
__device__ int __nv_mul24(int __a, int __b);
__device__ long long __nv_mul64hi(long long __a, long long __b);
__device__ int __nv_mulhi(int __a, int __b);
__device__ double __nv_nan(const signed char *__a);
__device__ float __nv_nanf(const signed char *__a);
__device__ double __nv_nearbyint(double __a);
__device__ float __nv_nearbyintf(float __a);
__device__ double __nv_nextafter(double __a, double __b);
__device__ float __nv_nextafterf(float __a, float __b);
__device__ double __nv_norm3d(double __a, double __b, double __c);
__device__ float __nv_norm3df(float __a, float __b, float __c);
__device__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__device__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__device__ double __nv_normcdf(double __a);
__device__ float __nv_normcdff(float __a);
__device__ double __nv_normcdfinv(double __a);
__device__ float __nv_normcdfinvf(float __a);
__device__ float __nv_normf(int __a, const float *__b);
__device__ double __nv_norm(int __a, const double *__b);
__device__ int __nv_popc(int __a);
__device__ int __nv_popcll(long long __a);
__device__ double __nv_pow(double __a, double __b);
__device__ float __nv_powf(float __a, float __b);
__device__ double __nv_powi(double __a, int __b);
__device__ float __nv_powif(float __a, int __b);
__device__ double __nv_rcbrt(double __a);
__device__ float __nv_rcbrtf(float __a);
__device__ double __nv_rcp64h(double __a);
__device__ double __nv_remainder(double __a, double __b);
__device__ float __nv_remainderf(float __a, float __b);
__device__ double __nv_remquo(double __a, double __b, int *__c);
__device__ float __nv_remquof(float __a, float __b, int *__c);
__device__ int __nv_rhadd(int __a, int __b);
__device__ double __nv_rhypot(double __a, double __b);
__device__ float __nv_rhypotf(float __a, float __b);
__device__ double __nv_rint(double __a);
__device__ float __nv_rintf(float __a);
__device__ double __nv_rnorm3d(double __a, double __b, double __c);
__device__ float __nv_rnorm3df(float __a, float __b, float __c);
__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__device__ float __nv_rnormf(int __a, const float *__b);
__device__ double __nv_rnorm(int __a, const double *__b);
__device__ double __nv_round(double __a);
__device__ float __nv_roundf(float __a);
__device__ double __nv_rsqrt(double __a);
__device__ float __nv_rsqrtf(float __a);
__device__ int __nv_sad(int __a, int __b, int __c);
__device__ float __nv_saturatef(float __a);
__device__ double __nv_scalbn(double __a, int __b);
__device__ float __nv_scalbnf(float __a, int __b);
__device__ int __nv_signbitd(double __a);
__device__ int __nv_signbitf(float __a);
__device__ void __nv_sincos(double __a, double *__b, double *__c);
__device__ void __nv_sincosf(float __a, float *__b, float *__c);
__device__ void __nv_sincospi(double __a, double *__b, double *__c);
__device__ void __nv_sincospif(float __a, float *__b, float *__c);
__device__ double __nv_sin(double __a);
__device__ float __nv_sinf(float __a);
__device__ double __nv_sinh(double __a);
__device__ float __nv_sinhf(float __a);
__device__ double __nv_sinpi(double __a);
__device__ float __nv_sinpif(float __a);
__device__ double __nv_sqrt(double __a);
__device__ float __nv_sqrtf(float __a);
__device__ double __nv_tan(double __a);
__device__ float __nv_tanf(float __a);
__device__ double __nv_tanh(double __a);
__device__ float __nv_tanhf(float __a);
__device__ double __nv_tgamma(double __a);
__device__ float __nv_tgammaf(float __a);
__device__ double __nv_trunc(double __a);
__device__ float __nv_truncf(float __a);
__device__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__device__ double __nv_uint2double_rn(unsigned int __i);
__device__ float __nv_uint2float_rd(unsigned int __a);
__device__ float __nv_uint2float_rn(unsigned int __a);
__device__ float __nv_uint2float_ru(unsigned int __a);
__device__ float __nv_uint2float_rz(unsigned int __a);
__device__ float __nv_uint_as_float(unsigned int __a);
__device__ double __nv_ull2double_rd(unsigned long long __a);
__device__ double __nv_ull2double_rn(unsigned long long __a);
__device__ double __nv_ull2double_ru(unsigned long long __a);
__device__ double __nv_ull2double_rz(unsigned long long __a);
__device__ float __nv_ull2float_rd(unsigned long long __a);
__device__ float __nv_ull2float_rn(unsigned long long __a);
__device__ float __nv_ull2float_ru(unsigned long long __a);
__device__ float __nv_ull2float_rz(unsigned long long __a);
__device__ unsigned long long __nv_ullmax(unsigned long long __a,
#if defined(_OPENMP)
#define __DEVICE__
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
__DEVICE__ int __nv_abs(int __a);
__DEVICE__ double __nv_acos(double __a);
__DEVICE__ float __nv_acosf(float __a);
__DEVICE__ double __nv_acosh(double __a);
__DEVICE__ float __nv_acoshf(float __a);
__DEVICE__ double __nv_asin(double __a);
__DEVICE__ float __nv_asinf(float __a);
__DEVICE__ double __nv_asinh(double __a);
__DEVICE__ float __nv_asinhf(float __a);
__DEVICE__ double __nv_atan2(double __a, double __b);
__DEVICE__ float __nv_atan2f(float __a, float __b);
__DEVICE__ double __nv_atan(double __a);
__DEVICE__ float __nv_atanf(float __a);
__DEVICE__ double __nv_atanh(double __a);
__DEVICE__ float __nv_atanhf(float __a);
__DEVICE__ int __nv_brev(int __a);
__DEVICE__ long long __nv_brevll(long long __a);
__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c);
__DEVICE__ double __nv_cbrt(double __a);
__DEVICE__ float __nv_cbrtf(float __a);
__DEVICE__ double __nv_ceil(double __a);
__DEVICE__ float __nv_ceilf(float __a);
__DEVICE__ int __nv_clz(int __a);
__DEVICE__ int __nv_clzll(long long __a);
__DEVICE__ double __nv_copysign(double __a, double __b);
__DEVICE__ float __nv_copysignf(float __a, float __b);
__DEVICE__ double __nv_cos(double __a);
__DEVICE__ float __nv_cosf(float __a);
__DEVICE__ double __nv_cosh(double __a);
__DEVICE__ float __nv_coshf(float __a);
__DEVICE__ double __nv_cospi(double __a);
__DEVICE__ float __nv_cospif(float __a);
__DEVICE__ double __nv_cyl_bessel_i0(double __a);
__DEVICE__ float __nv_cyl_bessel_i0f(float __a);
__DEVICE__ double __nv_cyl_bessel_i1(double __a);
__DEVICE__ float __nv_cyl_bessel_i1f(float __a);
__DEVICE__ double __nv_dadd_rd(double __a, double __b);
__DEVICE__ double __nv_dadd_rn(double __a, double __b);
__DEVICE__ double __nv_dadd_ru(double __a, double __b);
__DEVICE__ double __nv_dadd_rz(double __a, double __b);
__DEVICE__ double __nv_ddiv_rd(double __a, double __b);
__DEVICE__ double __nv_ddiv_rn(double __a, double __b);
__DEVICE__ double __nv_ddiv_ru(double __a, double __b);
__DEVICE__ double __nv_ddiv_rz(double __a, double __b);
__DEVICE__ double __nv_dmul_rd(double __a, double __b);
__DEVICE__ double __nv_dmul_rn(double __a, double __b);
__DEVICE__ double __nv_dmul_ru(double __a, double __b);
__DEVICE__ double __nv_dmul_rz(double __a, double __b);
__DEVICE__ float __nv_double2float_rd(double __a);
__DEVICE__ float __nv_double2float_rn(double __a);
__DEVICE__ float __nv_double2float_ru(double __a);
__DEVICE__ float __nv_double2float_rz(double __a);
__DEVICE__ int __nv_double2hiint(double __a);
__DEVICE__ int __nv_double2int_rd(double __a);
__DEVICE__ int __nv_double2int_rn(double __a);
__DEVICE__ int __nv_double2int_ru(double __a);
__DEVICE__ int __nv_double2int_rz(double __a);
__DEVICE__ long long __nv_double2ll_rd(double __a);
__DEVICE__ long long __nv_double2ll_rn(double __a);
__DEVICE__ long long __nv_double2ll_ru(double __a);
__DEVICE__ long long __nv_double2ll_rz(double __a);
__DEVICE__ int __nv_double2loint(double __a);
__DEVICE__ unsigned int __nv_double2uint_rd(double __a);
__DEVICE__ unsigned int __nv_double2uint_rn(double __a);
__DEVICE__ unsigned int __nv_double2uint_ru(double __a);
__DEVICE__ unsigned int __nv_double2uint_rz(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rd(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rn(double __a);
__DEVICE__ unsigned long long __nv_double2ull_ru(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rz(double __a);
__DEVICE__ unsigned long long __nv_double_as_longlong(double __a);
__DEVICE__ double __nv_drcp_rd(double __a);
__DEVICE__ double __nv_drcp_rn(double __a);
__DEVICE__ double __nv_drcp_ru(double __a);
__DEVICE__ double __nv_drcp_rz(double __a);
__DEVICE__ double __nv_dsqrt_rd(double __a);
__DEVICE__ double __nv_dsqrt_rn(double __a);
__DEVICE__ double __nv_dsqrt_ru(double __a);
__DEVICE__ double __nv_dsqrt_rz(double __a);
__DEVICE__ double __nv_dsub_rd(double __a, double __b);
__DEVICE__ double __nv_dsub_rn(double __a, double __b);
__DEVICE__ double __nv_dsub_ru(double __a, double __b);
__DEVICE__ double __nv_dsub_rz(double __a, double __b);
__DEVICE__ double __nv_erfc(double __a);
__DEVICE__ float __nv_erfcf(float __a);
__DEVICE__ double __nv_erfcinv(double __a);
__DEVICE__ float __nv_erfcinvf(float __a);
__DEVICE__ double __nv_erfcx(double __a);
__DEVICE__ float __nv_erfcxf(float __a);
__DEVICE__ double __nv_erf(double __a);
__DEVICE__ float __nv_erff(float __a);
__DEVICE__ double __nv_erfinv(double __a);
__DEVICE__ float __nv_erfinvf(float __a);
__DEVICE__ double __nv_exp10(double __a);
__DEVICE__ float __nv_exp10f(float __a);
__DEVICE__ double __nv_exp2(double __a);
__DEVICE__ float __nv_exp2f(float __a);
__DEVICE__ double __nv_exp(double __a);
__DEVICE__ float __nv_expf(float __a);
__DEVICE__ double __nv_expm1(double __a);
__DEVICE__ float __nv_expm1f(float __a);
__DEVICE__ double __nv_fabs(double __a);
__DEVICE__ float __nv_fabsf(float __a);
__DEVICE__ float __nv_fadd_rd(float __a, float __b);
__DEVICE__ float __nv_fadd_rn(float __a, float __b);
__DEVICE__ float __nv_fadd_ru(float __a, float __b);
__DEVICE__ float __nv_fadd_rz(float __a, float __b);
__DEVICE__ float __nv_fast_cosf(float __a);
__DEVICE__ float __nv_fast_exp10f(float __a);
__DEVICE__ float __nv_fast_expf(float __a);
__DEVICE__ float __nv_fast_fdividef(float __a, float __b);
__DEVICE__ float __nv_fast_log10f(float __a);
__DEVICE__ float __nv_fast_log2f(float __a);
__DEVICE__ float __nv_fast_logf(float __a);
__DEVICE__ float __nv_fast_powf(float __a, float __b);
__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c);
__DEVICE__ float __nv_fast_sinf(float __a);
__DEVICE__ float __nv_fast_tanf(float __a);
__DEVICE__ double __nv_fdim(double __a, double __b);
__DEVICE__ float __nv_fdimf(float __a, float __b);
__DEVICE__ float __nv_fdiv_rd(float __a, float __b);
__DEVICE__ float __nv_fdiv_rn(float __a, float __b);
__DEVICE__ float __nv_fdiv_ru(float __a, float __b);
__DEVICE__ float __nv_fdiv_rz(float __a, float __b);
__DEVICE__ int __nv_ffs(int __a);
__DEVICE__ int __nv_ffsll(long long __a);
__DEVICE__ int __nv_finitef(float __a);
__DEVICE__ unsigned short __nv_float2half_rn(float __a);
__DEVICE__ int __nv_float2int_rd(float __a);
__DEVICE__ int __nv_float2int_rn(float __a);
__DEVICE__ int __nv_float2int_ru(float __a);
__DEVICE__ int __nv_float2int_rz(float __a);
__DEVICE__ long long __nv_float2ll_rd(float __a);
__DEVICE__ long long __nv_float2ll_rn(float __a);
__DEVICE__ long long __nv_float2ll_ru(float __a);
__DEVICE__ long long __nv_float2ll_rz(float __a);
__DEVICE__ unsigned int __nv_float2uint_rd(float __a);
__DEVICE__ unsigned int __nv_float2uint_rn(float __a);
__DEVICE__ unsigned int __nv_float2uint_ru(float __a);
__DEVICE__ unsigned int __nv_float2uint_rz(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rd(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rn(float __a);
__DEVICE__ unsigned long long __nv_float2ull_ru(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rz(float __a);
__DEVICE__ int __nv_float_as_int(float __a);
__DEVICE__ unsigned int __nv_float_as_uint(float __a);
__DEVICE__ double __nv_floor(double __a);
__DEVICE__ float __nv_floorf(float __a);
__DEVICE__ double __nv_fma(double __a, double __b, double __c);
__DEVICE__ float __nv_fmaf(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c);
__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c);
__DEVICE__ double __nv_fmax(double __a, double __b);
__DEVICE__ float __nv_fmaxf(float __a, float __b);
__DEVICE__ double __nv_fmin(double __a, double __b);
__DEVICE__ float __nv_fminf(float __a, float __b);
__DEVICE__ double __nv_fmod(double __a, double __b);
__DEVICE__ float __nv_fmodf(float __a, float __b);
__DEVICE__ float __nv_fmul_rd(float __a, float __b);
__DEVICE__ float __nv_fmul_rn(float __a, float __b);
__DEVICE__ float __nv_fmul_ru(float __a, float __b);
__DEVICE__ float __nv_fmul_rz(float __a, float __b);
__DEVICE__ float __nv_frcp_rd(float __a);
__DEVICE__ float __nv_frcp_rn(float __a);
__DEVICE__ float __nv_frcp_ru(float __a);
__DEVICE__ float __nv_frcp_rz(float __a);
__DEVICE__ double __nv_frexp(double __a, int *__b);
__DEVICE__ float __nv_frexpf(float __a, int *__b);
__DEVICE__ float __nv_frsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_rd(float __a);
__DEVICE__ float __nv_fsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_ru(float __a);
__DEVICE__ float __nv_fsqrt_rz(float __a);
__DEVICE__ float __nv_fsub_rd(float __a, float __b);
__DEVICE__ float __nv_fsub_rn(float __a, float __b);
__DEVICE__ float __nv_fsub_ru(float __a, float __b);
__DEVICE__ float __nv_fsub_rz(float __a, float __b);
__DEVICE__ int __nv_hadd(int __a, int __b);
__DEVICE__ float __nv_half2float(unsigned short __h);
__DEVICE__ double __nv_hiloint2double(int __a, int __b);
__DEVICE__ double __nv_hypot(double __a, double __b);
__DEVICE__ float __nv_hypotf(float __a, float __b);
__DEVICE__ int __nv_ilogb(double __a);
__DEVICE__ int __nv_ilogbf(float __a);
__DEVICE__ double __nv_int2double_rn(int __a);
__DEVICE__ float __nv_int2float_rd(int __a);
__DEVICE__ float __nv_int2float_rn(int __a);
__DEVICE__ float __nv_int2float_ru(int __a);
__DEVICE__ float __nv_int2float_rz(int __a);
__DEVICE__ float __nv_int_as_float(int __a);
__DEVICE__ int __nv_isfinited(double __a);
__DEVICE__ int __nv_isinfd(double __a);
__DEVICE__ int __nv_isinff(float __a);
__DEVICE__ int __nv_isnand(double __a);
__DEVICE__ int __nv_isnanf(float __a);
__DEVICE__ double __nv_j0(double __a);
__DEVICE__ float __nv_j0f(float __a);
__DEVICE__ double __nv_j1(double __a);
__DEVICE__ float __nv_j1f(float __a);
__DEVICE__ float __nv_jnf(int __a, float __b);
__DEVICE__ double __nv_jn(int __a, double __b);
__DEVICE__ double __nv_ldexp(double __a, int __b);
__DEVICE__ float __nv_ldexpf(float __a, int __b);
__DEVICE__ double __nv_lgamma(double __a);
__DEVICE__ float __nv_lgammaf(float __a);
__DEVICE__ double __nv_ll2double_rd(long long __a);
__DEVICE__ double __nv_ll2double_rn(long long __a);
__DEVICE__ double __nv_ll2double_ru(long long __a);
__DEVICE__ double __nv_ll2double_rz(long long __a);
__DEVICE__ float __nv_ll2float_rd(long long __a);
__DEVICE__ float __nv_ll2float_rn(long long __a);
__DEVICE__ float __nv_ll2float_ru(long long __a);
__DEVICE__ float __nv_ll2float_rz(long long __a);
__DEVICE__ long long __nv_llabs(long long __a);
__DEVICE__ long long __nv_llmax(long long __a, long long __b);
__DEVICE__ long long __nv_llmin(long long __a, long long __b);
__DEVICE__ long long __nv_llrint(double __a);
__DEVICE__ long long __nv_llrintf(float __a);
__DEVICE__ long long __nv_llround(double __a);
__DEVICE__ long long __nv_llroundf(float __a);
__DEVICE__ double __nv_log10(double __a);
__DEVICE__ float __nv_log10f(float __a);
__DEVICE__ double __nv_log1p(double __a);
__DEVICE__ float __nv_log1pf(float __a);
__DEVICE__ double __nv_log2(double __a);
__DEVICE__ float __nv_log2f(float __a);
__DEVICE__ double __nv_logb(double __a);
__DEVICE__ float __nv_logbf(float __a);
__DEVICE__ double __nv_log(double __a);
__DEVICE__ float __nv_logf(float __a);
__DEVICE__ double __nv_longlong_as_double(long long __a);
__DEVICE__ int __nv_max(int __a, int __b);
__DEVICE__ int __nv_min(int __a, int __b);
__DEVICE__ double __nv_modf(double __a, double *__b);
__DEVICE__ float __nv_modff(float __a, float *__b);
__DEVICE__ int __nv_mul24(int __a, int __b);
__DEVICE__ long long __nv_mul64hi(long long __a, long long __b);
__DEVICE__ int __nv_mulhi(int __a, int __b);
__DEVICE__ double __nv_nan(const signed char *__a);
__DEVICE__ float __nv_nanf(const signed char *__a);
__DEVICE__ double __nv_nearbyint(double __a);
__DEVICE__ float __nv_nearbyintf(float __a);
__DEVICE__ double __nv_nextafter(double __a, double __b);
__DEVICE__ float __nv_nextafterf(float __a, float __b);
__DEVICE__ double __nv_norm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_norm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__DEVICE__ double __nv_normcdf(double __a);
__DEVICE__ float __nv_normcdff(float __a);
__DEVICE__ double __nv_normcdfinv(double __a);
__DEVICE__ float __nv_normcdfinvf(float __a);
__DEVICE__ float __nv_normf(int __a, const float *__b);
__DEVICE__ double __nv_norm(int __a, const double *__b);
__DEVICE__ int __nv_popc(int __a);
__DEVICE__ int __nv_popcll(long long __a);
__DEVICE__ double __nv_pow(double __a, double __b);
__DEVICE__ float __nv_powf(float __a, float __b);
__DEVICE__ double __nv_powi(double __a, int __b);
__DEVICE__ float __nv_powif(float __a, int __b);
__DEVICE__ double __nv_rcbrt(double __a);
__DEVICE__ float __nv_rcbrtf(float __a);
__DEVICE__ double __nv_rcp64h(double __a);
__DEVICE__ double __nv_remainder(double __a, double __b);
__DEVICE__ float __nv_remainderf(float __a, float __b);
__DEVICE__ double __nv_remquo(double __a, double __b, int *__c);
__DEVICE__ float __nv_remquof(float __a, float __b, int *__c);
__DEVICE__ int __nv_rhadd(int __a, int __b);
__DEVICE__ double __nv_rhypot(double __a, double __b);
__DEVICE__ float __nv_rhypotf(float __a, float __b);
__DEVICE__ double __nv_rint(double __a);
__DEVICE__ float __nv_rintf(float __a);
__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__DEVICE__ float __nv_rnormf(int __a, const float *__b);
__DEVICE__ double __nv_rnorm(int __a, const double *__b);
__DEVICE__ double __nv_round(double __a);
__DEVICE__ float __nv_roundf(float __a);
__DEVICE__ double __nv_rsqrt(double __a);
__DEVICE__ float __nv_rsqrtf(float __a);
__DEVICE__ int __nv_sad(int __a, int __b, int __c);
__DEVICE__ float __nv_saturatef(float __a);
__DEVICE__ double __nv_scalbn(double __a, int __b);
__DEVICE__ float __nv_scalbnf(float __a, int __b);
__DEVICE__ int __nv_signbitd(double __a);
__DEVICE__ int __nv_signbitf(float __a);
__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c);
__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c);
__DEVICE__ double __nv_sin(double __a);
__DEVICE__ float __nv_sinf(float __a);
__DEVICE__ double __nv_sinh(double __a);
__DEVICE__ float __nv_sinhf(float __a);
__DEVICE__ double __nv_sinpi(double __a);
__DEVICE__ float __nv_sinpif(float __a);
__DEVICE__ double __nv_sqrt(double __a);
__DEVICE__ float __nv_sqrtf(float __a);
__DEVICE__ double __nv_tan(double __a);
__DEVICE__ float __nv_tanf(float __a);
__DEVICE__ double __nv_tanh(double __a);
__DEVICE__ float __nv_tanhf(float __a);
__DEVICE__ double __nv_tgamma(double __a);
__DEVICE__ float __nv_tgammaf(float __a);
__DEVICE__ double __nv_trunc(double __a);
__DEVICE__ float __nv_truncf(float __a);
__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__DEVICE__ double __nv_uint2double_rn(unsigned int __i);
__DEVICE__ float __nv_uint2float_rd(unsigned int __a);
__DEVICE__ float __nv_uint2float_rn(unsigned int __a);
__DEVICE__ float __nv_uint2float_ru(unsigned int __a);
__DEVICE__ float __nv_uint2float_rz(unsigned int __a);
__DEVICE__ float __nv_uint_as_float(unsigned int __a);
__DEVICE__ double __nv_ull2double_rd(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rn(unsigned long long __a);
__DEVICE__ double __nv_ull2double_ru(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rz(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rd(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rn(unsigned long long __a);
__DEVICE__ float __nv_ull2float_ru(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rz(unsigned long long __a);
__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a,
unsigned long long __b);
__device__ unsigned long long __nv_ullmin(unsigned long long __a,
__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a,
unsigned long long __b);
__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__device__ unsigned long long __nv_umul64hi(unsigned long long __a,
__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a,
unsigned long long __b);
__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
unsigned int __c);
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
__device__ int __nv_vabs2(int __a);
__device__ int __nv_vabs4(int __a);
__device__ int __nv_vabsdiffs2(int __a, int __b);
__device__ int __nv_vabsdiffs4(int __a, int __b);
__device__ int __nv_vabsdiffu2(int __a, int __b);
__device__ int __nv_vabsdiffu4(int __a, int __b);
__device__ int __nv_vabsss2(int __a);
__device__ int __nv_vabsss4(int __a);
__device__ int __nv_vadd2(int __a, int __b);
__device__ int __nv_vadd4(int __a, int __b);
__device__ int __nv_vaddss2(int __a, int __b);
__device__ int __nv_vaddss4(int __a, int __b);
__device__ int __nv_vaddus2(int __a, int __b);
__device__ int __nv_vaddus4(int __a, int __b);
__device__ int __nv_vavgs2(int __a, int __b);
__device__ int __nv_vavgs4(int __a, int __b);
__device__ int __nv_vavgu2(int __a, int __b);
__device__ int __nv_vavgu4(int __a, int __b);
__device__ int __nv_vcmpeq2(int __a, int __b);
__device__ int __nv_vcmpeq4(int __a, int __b);
__device__ int __nv_vcmpges2(int __a, int __b);
__device__ int __nv_vcmpges4(int __a, int __b);
__device__ int __nv_vcmpgeu2(int __a, int __b);
__device__ int __nv_vcmpgeu4(int __a, int __b);
__device__ int __nv_vcmpgts2(int __a, int __b);
__device__ int __nv_vcmpgts4(int __a, int __b);
__device__ int __nv_vcmpgtu2(int __a, int __b);
__device__ int __nv_vcmpgtu4(int __a, int __b);
__device__ int __nv_vcmples2(int __a, int __b);
__device__ int __nv_vcmples4(int __a, int __b);
__device__ int __nv_vcmpleu2(int __a, int __b);
__device__ int __nv_vcmpleu4(int __a, int __b);
__device__ int __nv_vcmplts2(int __a, int __b);
__device__ int __nv_vcmplts4(int __a, int __b);
__device__ int __nv_vcmpltu2(int __a, int __b);
__device__ int __nv_vcmpltu4(int __a, int __b);
__device__ int __nv_vcmpne2(int __a, int __b);
__device__ int __nv_vcmpne4(int __a, int __b);
__device__ int __nv_vhaddu2(int __a, int __b);
__device__ int __nv_vhaddu4(int __a, int __b);
__device__ int __nv_vmaxs2(int __a, int __b);
__device__ int __nv_vmaxs4(int __a, int __b);
__device__ int __nv_vmaxu2(int __a, int __b);
__device__ int __nv_vmaxu4(int __a, int __b);
__device__ int __nv_vmins2(int __a, int __b);
__device__ int __nv_vmins4(int __a, int __b);
__device__ int __nv_vminu2(int __a, int __b);
__device__ int __nv_vminu4(int __a, int __b);
__device__ int __nv_vneg2(int __a);
__device__ int __nv_vneg4(int __a);
__device__ int __nv_vnegss2(int __a);
__device__ int __nv_vnegss4(int __a);
__device__ int __nv_vsads2(int __a, int __b);
__device__ int __nv_vsads4(int __a, int __b);
__device__ int __nv_vsadu2(int __a, int __b);
__device__ int __nv_vsadu4(int __a, int __b);
__device__ int __nv_vseteq2(int __a, int __b);
__device__ int __nv_vseteq4(int __a, int __b);
__device__ int __nv_vsetges2(int __a, int __b);
__device__ int __nv_vsetges4(int __a, int __b);
__device__ int __nv_vsetgeu2(int __a, int __b);
__device__ int __nv_vsetgeu4(int __a, int __b);
__device__ int __nv_vsetgts2(int __a, int __b);
__device__ int __nv_vsetgts4(int __a, int __b);
__device__ int __nv_vsetgtu2(int __a, int __b);
__device__ int __nv_vsetgtu4(int __a, int __b);
__device__ int __nv_vsetles2(int __a, int __b);
__device__ int __nv_vsetles4(int __a, int __b);
__device__ int __nv_vsetleu2(int __a, int __b);
__device__ int __nv_vsetleu4(int __a, int __b);
__device__ int __nv_vsetlts2(int __a, int __b);
__device__ int __nv_vsetlts4(int __a, int __b);
__device__ int __nv_vsetltu2(int __a, int __b);
__device__ int __nv_vsetltu4(int __a, int __b);
__device__ int __nv_vsetne2(int __a, int __b);
__device__ int __nv_vsetne4(int __a, int __b);
__device__ int __nv_vsub2(int __a, int __b);
__device__ int __nv_vsub4(int __a, int __b);
__device__ int __nv_vsubss2(int __a, int __b);
__device__ int __nv_vsubss4(int __a, int __b);
__device__ int __nv_vsubus2(int __a, int __b);
__device__ int __nv_vsubus4(int __a, int __b);
__DEVICE__ int __nv_vabs2(int __a);
__DEVICE__ int __nv_vabs4(int __a);
__DEVICE__ int __nv_vabsdiffs2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffs4(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu4(int __a, int __b);
__DEVICE__ int __nv_vabsss2(int __a);
__DEVICE__ int __nv_vabsss4(int __a);
__DEVICE__ int __nv_vadd2(int __a, int __b);
__DEVICE__ int __nv_vadd4(int __a, int __b);
__DEVICE__ int __nv_vaddss2(int __a, int __b);
__DEVICE__ int __nv_vaddss4(int __a, int __b);
__DEVICE__ int __nv_vaddus2(int __a, int __b);
__DEVICE__ int __nv_vaddus4(int __a, int __b);
__DEVICE__ int __nv_vavgs2(int __a, int __b);
__DEVICE__ int __nv_vavgs4(int __a, int __b);
__DEVICE__ int __nv_vavgu2(int __a, int __b);
__DEVICE__ int __nv_vavgu4(int __a, int __b);
__DEVICE__ int __nv_vcmpeq2(int __a, int __b);
__DEVICE__ int __nv_vcmpeq4(int __a, int __b);
__DEVICE__ int __nv_vcmpges2(int __a, int __b);
__DEVICE__ int __nv_vcmpges4(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu4(int __a, int __b);
__DEVICE__ int __nv_vcmpgts2(int __a, int __b);
__DEVICE__ int __nv_vcmpgts4(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu4(int __a, int __b);
__DEVICE__ int __nv_vcmples2(int __a, int __b);
__DEVICE__ int __nv_vcmples4(int __a, int __b);
__DEVICE__ int __nv_vcmpleu2(int __a, int __b);
__DEVICE__ int __nv_vcmpleu4(int __a, int __b);
__DEVICE__ int __nv_vcmplts2(int __a, int __b);
__DEVICE__ int __nv_vcmplts4(int __a, int __b);
__DEVICE__ int __nv_vcmpltu2(int __a, int __b);
__DEVICE__ int __nv_vcmpltu4(int __a, int __b);
__DEVICE__ int __nv_vcmpne2(int __a, int __b);
__DEVICE__ int __nv_vcmpne4(int __a, int __b);
__DEVICE__ int __nv_vhaddu2(int __a, int __b);
__DEVICE__ int __nv_vhaddu4(int __a, int __b);
__DEVICE__ int __nv_vmaxs2(int __a, int __b);
__DEVICE__ int __nv_vmaxs4(int __a, int __b);
__DEVICE__ int __nv_vmaxu2(int __a, int __b);
__DEVICE__ int __nv_vmaxu4(int __a, int __b);
__DEVICE__ int __nv_vmins2(int __a, int __b);
__DEVICE__ int __nv_vmins4(int __a, int __b);
__DEVICE__ int __nv_vminu2(int __a, int __b);
__DEVICE__ int __nv_vminu4(int __a, int __b);
__DEVICE__ int __nv_vneg2(int __a);
__DEVICE__ int __nv_vneg4(int __a);
__DEVICE__ int __nv_vnegss2(int __a);
__DEVICE__ int __nv_vnegss4(int __a);
__DEVICE__ int __nv_vsads2(int __a, int __b);
__DEVICE__ int __nv_vsads4(int __a, int __b);
__DEVICE__ int __nv_vsadu2(int __a, int __b);
__DEVICE__ int __nv_vsadu4(int __a, int __b);
__DEVICE__ int __nv_vseteq2(int __a, int __b);
__DEVICE__ int __nv_vseteq4(int __a, int __b);
__DEVICE__ int __nv_vsetges2(int __a, int __b);
__DEVICE__ int __nv_vsetges4(int __a, int __b);
__DEVICE__ int __nv_vsetgeu2(int __a, int __b);
__DEVICE__ int __nv_vsetgeu4(int __a, int __b);
__DEVICE__ int __nv_vsetgts2(int __a, int __b);
__DEVICE__ int __nv_vsetgts4(int __a, int __b);
__DEVICE__ int __nv_vsetgtu2(int __a, int __b);
__DEVICE__ int __nv_vsetgtu4(int __a, int __b);
__DEVICE__ int __nv_vsetles2(int __a, int __b);
__DEVICE__ int __nv_vsetles4(int __a, int __b);
__DEVICE__ int __nv_vsetleu2(int __a, int __b);
__DEVICE__ int __nv_vsetleu4(int __a, int __b);
__DEVICE__ int __nv_vsetlts2(int __a, int __b);
__DEVICE__ int __nv_vsetlts4(int __a, int __b);
__DEVICE__ int __nv_vsetltu2(int __a, int __b);
__DEVICE__ int __nv_vsetltu4(int __a, int __b);
__DEVICE__ int __nv_vsetne2(int __a, int __b);
__DEVICE__ int __nv_vsetne4(int __a, int __b);
__DEVICE__ int __nv_vsub2(int __a, int __b);
__DEVICE__ int __nv_vsub4(int __a, int __b);
__DEVICE__ int __nv_vsubss2(int __a, int __b);
__DEVICE__ int __nv_vsubss4(int __a, int __b);
__DEVICE__ int __nv_vsubus2(int __a, int __b);
__DEVICE__ int __nv_vsubus4(int __a, int __b);
#endif // CUDA_VERSION
__device__ double __nv_y0(double __a);
__device__ float __nv_y0f(float __a);
__device__ double __nv_y1(double __a);
__device__ float __nv_y1f(float __a);
__device__ float __nv_ynf(int __a, float __b);
__device__ double __nv_yn(int __a, double __b);
__DEVICE__ double __nv_y0(double __a);
__DEVICE__ float __nv_y0f(float __a);
__DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__

View File

@ -20,8 +20,12 @@
// would preclude the use of our own __device__ overloads for these functions.
#pragma push_macro("__DEVICE__")
#ifdef _OPENMP
#define __DEVICE__ static __inline__ __attribute__((always_inline))
#else
#define __DEVICE__ \
static __inline__ __attribute__((always_inline)) __attribute__((device))
#endif
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);

View File

@ -0,0 +1,44 @@
/*===---- __clang_openmp_math.h - OpenMP target math support ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if defined(__NVPTX__) && defined(_OPENMP)
/// TODO:
/// We are currently reusing the functionality of the Clang-CUDA code path
/// as an alternative to the host declarations provided by math.h and cmath.
/// This is suboptimal.
///
/// We should instead declare the device functions in a similar way, e.g.,
/// through OpenMP 5.0 variants, and afterwards populate the module with the
/// host declarations by unconditionally including the host math.h or cmath,
/// respectively. This is actually what the Clang-CUDA code path does, using
/// __device__ instead of variants to avoid redeclarations and get the desired
/// overload resolution.
#define __CUDA__
#if defined(__cplusplus)
#include <__clang_cuda_math_forward_declares.h>
#endif
/// Include declarations for libdevice functions.
#include <__clang_cuda_libdevice_declares.h>
/// Provide definitions for these functions.
#include <__clang_cuda_device_functions.h>
#if defined(__cplusplus)
#include <__clang_cuda_cmath.h>
#endif
#undef __CUDA__
/// Magic macro for stopping the math.h/cmath host header from being included.
#define __CLANG_NO_HOST_MATH__
#endif

View File

@ -0,0 +1,18 @@
/*===-------------- cmath - Alternative cmath header -----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __cplusplus
#include <__clang_openmp_math.h>
#endif
#ifndef __CLANG_NO_HOST_MATH__
#include_next <cmath>
#else
#undef __CLANG_NO_HOST_MATH__
#endif

View File

@ -0,0 +1,19 @@
/*===------------- math.h - Alternative math.h header ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __cplusplus
#include <__clang_openmp_math.h>
#endif
#ifndef __CLANG_NO_HOST_MATH__
#include_next <math.h>
#else
#undef __CLANG_NO_HOST_MATH__
#endif

View File

@ -278,3 +278,8 @@
// RUN: | FileCheck -check-prefix=CUDA_RED_RECS %s
// CUDA_RED_RECS: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"
// CUDA_RED_RECS-SAME: "-fopenmp-cuda-teams-reduction-recs-num=2048"
// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda %s 2>&1 \
// RUN: | FileCheck -check-prefix=OPENMP_NVPTX_WRAPPERS %s
// OPENMP_NVPTX_WRAPPERS: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"
// OPENMP_NVPTX_WRAPPERS-SAME: "-internal-isystem" "{{.*}}openmp_wrappers"

View File

@ -0,0 +1,5 @@
#pragma once
double sqrt(double);
double pow(double, double);
double modf(double, double*);

View File

@ -0,0 +1,10 @@
#pragma once
namespace std
{
struct __numeric_limits_base
{};
template<typename _Tp>
struct numeric_limits : public __numeric_limits_base
{};
}

View File

@ -1 +1,5 @@
#pragma once
double sqrt(double);
double pow(double, double);
double modf(double, double*);

View File

@ -0,0 +1,21 @@
// Test calling of device math functions.
///==========================================================================///
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
#include <cmath>
void test_sqrt(double a1) {
#pragma omp target
{
// CHECK-YES: call double @__nv_sqrt(double
double l1 = sqrt(a1);
// CHECK-YES: call double @__nv_pow(double
double l2 = pow(a1, a1);
// CHECK-YES: call double @__nv_modf(double
double l3 = modf(a1 + 3.5, &a1);
}
}

View File

@ -0,0 +1,21 @@
// Test calling of device math functions.
///==========================================================================///
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include cmath -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include cmath -internal-isystem %S/Inputs/include -include stdlib.h -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
#include <cmath>
void test_sqrt(double a1) {
#pragma omp target
{
// CHECK-YES: call double @__nv_sqrt(double
double l1 = sqrt(a1);
// CHECK-YES: call double @__nv_pow(double
double l2 = pow(a1, a1);
// CHECK-YES: call double @__nv_modf(double
double l3 = modf(a1 + 3.5, &a1);
}
}

View File

@ -0,0 +1,21 @@
// Test calling of device math functions.
///==========================================================================///
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
#include <math.h>
void test_sqrt(double a1) {
#pragma omp target
{
// CHECK-YES: call double @__nv_sqrt(double
double l1 = sqrt(a1);
// CHECK-YES: call double @__nv_pow(double
double l2 = pow(a1, a1);
// CHECK-YES: call double @__nv_modf(double
double l3 = modf(a1 + 3.5, &a1);
}
}

View File

@ -0,0 +1,21 @@
// Test calling of device math functions.
///==========================================================================///
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include -include math.h -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_math.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -include math.h -internal-isystem %S/Inputs/include -include stdlib.h -include limits -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s
#include <math.h>
void test_sqrt(double a1) {
#pragma omp target
{
// CHECK-YES: call double @__nv_sqrt(double
double l1 = sqrt(a1);
// CHECK-YES: call double @__nv_pow(double
double l2 = pow(a1, a1);
// CHECK-YES: call double @__nv_modf(double
double l3 = modf(a1 + 3.5, &a1);
}
}