lammps/lib/gpu/lal_pre_cuda_hip.h

// **************************************************************************
// pre_cuda_hip.h
// -------------------
// W. Michael Brown (ORNL)
// Nitin Dhamankar (Intel)
//
// Device-side preprocessor definitions for CUDA and HIP builds
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
//*************************************************************************
// Device Configuration Definitions
// See lal_preprocessor.h for definitions
//*************************************************************************/
// -------------------------------------------------------------------------
// CUDA and HIP DEFINITIONS
// -------------------------------------------------------------------------
#if defined(NV_KERNEL) || defined(USE_HIP)
// -------------------------------------------------------------------------
// DEVICE CONFIGURATION
// -------------------------------------------------------------------------
#ifdef __HIP_PLATFORM_HCC__
#define CONFIG_ID 303
#define SIMD_SIZE 64
#else
#define CONFIG_ID 103
#define SIMD_SIZE 32
#endif
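// SIMD_SIZE matches the hardware execution width: 32 threads per warp on
// NVIDIA devices, 64 work-items per wavefront on AMD GCN-class devices.
// CONFIG_ID reports to the host library which parameter set is in use.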
#define MEM_THREADS SIMD_SIZE
#define SHUFFLE_AVAIL 1
#define FAST_MATH 1
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define THREADS_PER_THREE 2
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define PPPM_BLOCK_1D 64
#define BLOCK_NBOR_BUILD 128
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_SHARED_TYPES 11
#define MAX_BIO_SHARED_TYPES 128
#define PPPM_MAX_SPLINE 8
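// THREADS_PER_ATOM, THREADS_PER_CHARGE, and THREADS_PER_THREE set how many
// threads cooperate on one atom's neighbor list in the pair, charge, and
// three-body kernels; the BLOCK_* values are the default thread-block sizes
// per kernel family.  As an illustrative sketch (not code from the kernels;
// numj stands in for the atom's neighbor count), the cooperative split is:
//
//   int i = (GLOBAL_ID_X)/THREADS_PER_ATOM;           // atom for this team
//   int offset = THREAD_ID_X & (THREADS_PER_ATOM-1);  // lane within team
//   for (int nbor = offset; nbor < numj; nbor += THREADS_PER_ATOM)
//     ;  // each lane handles every THREADS_PER_ATOM-th neighbor,
//        // then the partial forces are reduced across the team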
// -------------------------------------------------------------------------
// LEGACY DEVICE CONFIGURATION
// -------------------------------------------------------------------------
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ < 200)
#undef CONFIG_ID
#define CONFIG_ID 101
#define MEM_THREADS 16
#undef THREADS_PER_ATOM
#define THREADS_PER_ATOM 1
#undef THREADS_PER_CHARGE
#define THREADS_PER_CHARGE 16
#undef BLOCK_PAIR
#define BLOCK_PAIR 64
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 64
#undef BLOCK_NBOR_BUILD
#define BLOCK_NBOR_BUILD 64
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0
#elif (__CUDA_ARCH__ < 300)
#undef CONFIG_ID
#define CONFIG_ID 102
#undef BLOCK_PAIR
#define BLOCK_PAIR 128
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 128
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0
#endif
#endif
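// Warp shuffle instructions first appeared with Kepler (sm_30), so these
// older architectures set SHUFFLE_AVAIL 0 and fall back to shared-memory
// reductions, with smaller default block sizes to match their limits.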
// -------------------------------------------------------------------------
// KERNEL MACROS
// -------------------------------------------------------------------------
#ifdef USE_HIP
#include <hip/hip_runtime.h>
#endif
#define fast_mul(X,Y) ((X)*(Y))
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ < 200)
#undef fast_mul
#define fast_mul __mul24
#endif
#endif
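// On compute capability 1.x devices, the 24-bit multiply __mul24 was
// cheaper than a full 32-bit multiply, so index arithmetic funnels
// through fast_mul.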
#define EVFLAG 1
#define NOUNROLL
#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x)
#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y)
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define NUM_BLOCKS_X gridDim.x
#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#define simd_size() SIMD_SIZE
#define simdsync()
#ifdef NV_KERNEL
#if (__CUDACC_VER_MAJOR__ >= 9)
#undef simdsync
#define simdsync() __syncwarp(0xffffffff)
#endif
#endif
#ifdef __HIP_PLATFORM_NVCC__
#undef simdsync
#define simdsync() __syncwarp(0xffffffff)
#endif
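// The macros above let a single kernel source build as either OpenCL or
// CUDA/HIP.  A minimal illustrative kernel (not part of the library)
// written against them:
//
//   __kernel void k_scale(__global float *restrict x, const int n) {
//     int i = GLOBAL_ID_X;   // expands to thread/block index arithmetic
//     if (i < n) x[i] *= 0.5f;
//   }
//
// Under CUDA/HIP this becomes an extern "C" __global__ function taking a
// __restrict__ pointer; the same source compiles unchanged under OpenCL.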
// -------------------------------------------------------------------------
// KERNEL MACROS - TEXTURES
// -------------------------------------------------------------------------
#ifdef __HIP_PLATFORM_HCC__
#define _texture(name, type) __device__ type* name
#define _texture_2d(name, type) __device__ type* name
#else
#define _texture(name, type) texture<type> name
#define _texture_2d(name, type) texture<type,1> name
#endif
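// On the HCC (AMD) platform there is no texture path: _texture declares a
// plain device pointer, and the fetch macros below are overridden to read
// straight from global memory.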
#if (__CUDACC_VER_MAJOR__ < 11)
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
  int4 xy = tex1Dfetch(pos_tex,i*2); \
  int4 zt = tex1Dfetch(pos_tex,i*2+1); \
  ans.x=__hiloint2double(xy.y, xy.x); \
  ans.y=__hiloint2double(xy.w, xy.z); \
  ans.z=__hiloint2double(zt.y, zt.x); \
  ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
  int2 qt = tex1Dfetch(q_tex,i); \
  ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#else
#define fetch4(ans,i,x) ans=x[i]
#define fetch(ans,i,q) ans=q[i]
#undef _texture
#undef _texture_2d
#define _texture(name, type)
#define _texture_2d(name, type)
#define pos_tex x_
#define quat_tex qif
#define q_tex q_
#define vel_tex v_
#define mu_tex mu_
#endif
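// Texture references are deprecated beginning with CUDA 11, so newer
// toolkits skip the texture path entirely: fetch/fetch4 become direct
// array reads and the *_tex names alias the kernels' global-memory
// arguments (x_, q_, v_, ...).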
#ifdef __HIP_PLATFORM_HCC__
#undef fetch4
#undef fetch
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
#endif
#endif
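// Illustrative use (not library code): whichever branch above is active,
// a kernel loads positions and charges the same way,
//
//   numtyp4 ix; fetch4(ix,i,pos_tex);   // position + type of atom i
//   numtyp qtmp; fetch(qtmp,i,q_tex);   // charge of atom i
//
// and the macro resolves to a texture fetch or a plain global load.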
// -------------------------------------------------------------------------
// KERNEL MACROS - MATH
// -------------------------------------------------------------------------
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4 {
  double x, y, z, w;
};
typedef struct _double4 double4;
#endif
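// For toolkits predating CUDA 3.0 (CUDA_PRE_THREE), a 16-byte-aligned
// double4 is defined above so double-precision vector loads stay aligned.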
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf
#endif
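// The ucl_* wrappers keep the kernels generic over precision: compiled
// with numtyp as float they resolve to the single-precision intrinsics,
// with numtyp as double to the standard double-precision functions.  An
// illustrative fragment (not library code; lj1 and lj2 stand in for
// precomputed coefficients):
//
//   numtyp r2inv = ucl_recip(rsq);        // 1/r^2 in either precision
//   numtyp r6inv = r2inv*r2inv*r2inv;
//   numtyp force = r2inv*r6inv*(lj1*r6inv - lj2);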
// -------------------------------------------------------------------------
// KERNEL MACROS - SHUFFLE
// -------------------------------------------------------------------------
#if SHUFFLE_AVAIL == 1
#ifndef USE_HIP
#if (__CUDACC_VER_MAJOR__ < 9)
#define CUDA_PRE_NINE
#endif
#endif
#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__)
#ifdef _SINGLE_SINGLE
#define shfl_down __shfl_down
#define shfl_xor __shfl_xor
#else
ucl_inline double shfl_down(double var, unsigned int delta, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_down(tmp.x,delta,width);
  tmp.y = __shfl_down(tmp.y,delta,width);
  return __hiloint2double(tmp.x,tmp.y);
}
ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_xor(tmp.x,lanemask,width);
  tmp.y = __shfl_xor(tmp.y,lanemask,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#define simd_broadcast_i __shfl
#define simd_broadcast_f __shfl
#ifdef _DOUBLE_DOUBLE
ucl_inline double simd_broadcast_d(double var, unsigned int src,
                                   int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl(tmp.x,src,width);
  tmp.y = __shfl(tmp.y,src,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#else
#ifdef _SINGLE_SINGLE
ucl_inline float shfl_down(float var, unsigned int delta, int width) {
  return __shfl_down_sync(0xffffffff, var, delta, width);
}
ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) {
  return __shfl_xor_sync(0xffffffff, var, lanemask, width);
}
#else
ucl_inline double shfl_down(double var, unsigned int delta, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width);
  tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width);
  return __hiloint2double(tmp.x,tmp.y);
}
ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width);
  tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#define simd_broadcast_i(var, src, width) \
  __shfl_sync(0xffffffff, var, src, width)
#define simd_broadcast_f(var, src, width) \
  __shfl_sync(0xffffffff, var, src, width)
#ifdef _DOUBLE_DOUBLE
ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width);
  tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#endif
#endif
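// Illustrative use (not library code): the pair kernels combine the
// partial forces held by each lane of a team with a shuffle-based tree
// reduction,
//
//   for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
//     f.x += shfl_down(f.x, s, t_per_atom);
//   simdsync();
//
// where t_per_atom is the team width.  The *_sync variants used for
// CUDA 9+ carry an explicit full-warp mask because warp execution is no
// longer implicitly lockstep from Volta onward.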
// -------------------------------------------------------------------------
// END CUDA / HIP DEFINITIONS
// -------------------------------------------------------------------------
#endif