mirror of https://github.com/lammps/lammps.git
356 lines
10 KiB
C
356 lines
10 KiB
C
// **************************************************************************
|
|
// pre_cuda_hip.h
|
|
// -------------------
|
|
// W. Michael Brown (ORNL)
|
|
// Nitin Dhamankar (Intel)
|
|
//
|
|
// Device-side preprocessor definitions for CUDA and HIP builds
|
|
//
|
|
// __________________________________________________________________________
|
|
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
|
|
// __________________________________________________________________________
|
|
//
|
|
// begin :
|
|
// email : brownw@ornl.gov
|
|
// ***************************************************************************/
|
|
|
|
//*************************************************************************
|
|
// Device Configuration Definitions
|
|
// See lal_preprocessor.h for definitions
|
|
//*************************************************************************/
|
|
|
|
// -------------------------------------------------------------------------
|
|
// CUDA and HIP DEFINITIONS
|
|
// -------------------------------------------------------------------------
|
|
|
|
#if defined(NV_KERNEL) || defined(USE_HIP)
|
|
|
|
// -------------------------------------------------------------------------
|
|
// DEVICE CONFIGURATION
|
|
// -------------------------------------------------------------------------
|
|
|
|
|
|
#ifdef __HIP_PLATFORM_HCC__
|
|
#define CONFIG_ID 303
|
|
#define SIMD_SIZE 64
|
|
#else
|
|
#define CONFIG_ID 103
|
|
#define SIMD_SIZE 32
|
|
#endif
|
|
|
|
#define MEM_THREADS SIMD_SIZE
|
|
#define SHUFFLE_AVAIL 1
|
|
#define FAST_MATH 1
|
|
|
|
#define THREADS_PER_ATOM 4
|
|
#define THREADS_PER_CHARGE 8
|
|
#define THREADS_PER_THREE 2
|
|
|
|
#define BLOCK_PAIR 256
|
|
#define BLOCK_BIO_PAIR 256
|
|
#define BLOCK_ELLIPSE 128
|
|
#define PPPM_BLOCK_1D 64
|
|
#define BLOCK_NBOR_BUILD 128
|
|
#define BLOCK_CELL_2D 8
|
|
#define BLOCK_CELL_ID 128
|
|
|
|
#define MAX_SHARED_TYPES 11
|
|
#define MAX_BIO_SHARED_TYPES 128
|
|
#define PPPM_MAX_SPLINE 8
|
|
|
|
// -------------------------------------------------------------------------
|
|
// LEGACY DEVICE CONFIGURATION
|
|
// -------------------------------------------------------------------------
|
|
|
|
#ifdef __CUDA_ARCH__
|
|
|
|
#if (__CUDA_ARCH__ < 200)
|
|
|
|
#undef CONFIG_ID
|
|
#define CONFIG_ID 101
|
|
#define MEM_THREADS 16
|
|
#undef THREADS_PER_ATOM
|
|
#define THREADS_PER_ATOM 1
|
|
#undef THREADS_PER_CHARGE
|
|
#define THREADS_PER_CHARGE 16
|
|
#undef BLOCK_PAIR
|
|
#define BLOCK_PAIR 64
|
|
#undef BLOCK_BIO_PAIR
|
|
#define BLOCK_BIO_PAIR 64
|
|
#undef BLOCK_NBOR_BUILD
|
|
#define BLOCK_NBOR_BUILD 64
|
|
#undef MAX_SHARED_TYPES
|
|
#define MAX_SHARED_TYPES 8
|
|
#undef SHUFFLE_AVAIL
|
|
#define SHUFFLE_AVAIL 0
|
|
|
|
#elseif (__CUDA_ARCH__ < 300)
|
|
|
|
#undef CONFIG_ID
|
|
#define CONFIG_ID 102
|
|
#undef BLOCK_PAIR
|
|
#define BLOCK_PAIR 128
|
|
#undef BLOCK_BIO_PAIR
|
|
#define BLOCK_BIO_PAIR 128
|
|
#undef MAX_SHARED_TYPES
|
|
#define MAX_SHARED_TYPES 8
|
|
#undef SHUFFLE_AVAIL
|
|
#define SHUFFLE_AVAIL 0
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
// -------------------------------------------------------------------------
|
|
// KERNEL MACROS
|
|
// -------------------------------------------------------------------------
|
|
|
|
#ifdef USE_HIP
|
|
#include <hip/hip_runtime.h>
|
|
#endif
|
|
|
|
#define fast_mul(X,Y) (X)*(Y)
|
|
|
|
#ifdef __CUDA_ARCH__
|
|
#if (__CUDA_ARCH__ < 200)
|
|
#define fast_mul __mul24
|
|
#endif
|
|
#endif
|
|
|
|
#define EVFLAG 1
|
|
#define NOUNROLL
|
|
#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
|
|
#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y)
|
|
#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x);
|
|
#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y);
|
|
#define THREAD_ID_X threadIdx.x
|
|
#define THREAD_ID_Y threadIdx.y
|
|
#define BLOCK_ID_X blockIdx.x
|
|
#define BLOCK_ID_Y blockIdx.y
|
|
#define BLOCK_SIZE_X blockDim.x
|
|
#define BLOCK_SIZE_Y blockDim.y
|
|
#define NUM_BLOCKS_X gridDim.x
|
|
|
|
#define __kernel extern "C" __global__
|
|
#ifdef __local
|
|
#undef __local
|
|
#endif
|
|
#define __local __shared__
|
|
#define __global
|
|
#define restrict __restrict__
|
|
#define atom_add atomicAdd
|
|
#define ucl_inline static __inline__ __device__
|
|
|
|
#define simd_size() SIMD_SIZE
|
|
|
|
#define simdsync()
|
|
|
|
#ifdef NV_KERNEL
|
|
#if (__CUDACC_VER_MAJOR__ >= 9)
|
|
#undef simdsync
|
|
#define simdsync() __syncwarp(0xffffffff)
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef __HIP_PLATFORM_NVCC__
|
|
#undef simdsync()
|
|
#define simdsync() __syncwarp(0xffffffff)
|
|
#endif
|
|
|
|
// -------------------------------------------------------------------------
|
|
// KERNEL MACROS - TEXTURES
|
|
// -------------------------------------------------------------------------
|
|
|
|
#ifdef __HIP_PLATFORM_HCC__
|
|
#define _texture(name, type) __device__ type* name
|
|
#define _texture_2d(name, type) __device__ type* name
|
|
#else
|
|
#define _texture(name, type) texture<type> name
|
|
#define _texture_2d(name, type) texture<type,1> name
|
|
#endif
|
|
|
|
#if (__CUDACC_VER_MAJOR__ < 11)
|
|
#ifdef _DOUBLE_DOUBLE
|
|
#define fetch4(ans,i,pos_tex) { \
|
|
int4 xy = tex1Dfetch(pos_tex,i*2); \
|
|
int4 zt = tex1Dfetch(pos_tex,i*2+1); \
|
|
ans.x=__hiloint2double(xy.y, xy.x); \
|
|
ans.y=__hiloint2double(xy.w, xy.z); \
|
|
ans.z=__hiloint2double(zt.y, zt.x); \
|
|
ans.w=__hiloint2double(zt.w, zt.z); \
|
|
}
|
|
#define fetch(ans,i,q_tex) { \
|
|
int2 qt = tex1Dfetch(q_tex,i); \
|
|
ans=__hiloint2double(qt.y, qt.x); \
|
|
}
|
|
#else
|
|
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
|
|
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
|
|
#endif
|
|
#else
|
|
#define fetch4(ans,i,x) ans=x[i]
|
|
#define fetch(ans,i,q) ans=q[i]
|
|
#undef _texture
|
|
#undef _texture_2d
|
|
#define _texture(name, type)
|
|
#define _texture_2d(name, type)
|
|
#define pos_tex x_
|
|
#define quat_tex qif
|
|
#define q_tex q_
|
|
#define vel_tex v_
|
|
#define mu_tex mu_
|
|
#endif
|
|
|
|
#ifdef __HIP_PLATFORM_HCC__
|
|
|
|
#undef fetch4
|
|
#undef fetch
|
|
|
|
#ifdef _DOUBLE_DOUBLE
|
|
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
|
|
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
|
|
#else
|
|
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
|
|
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
|
|
#endif
|
|
|
|
#endif
|
|
|
|
// -------------------------------------------------------------------------
|
|
// KERNEL MACROS - MATH
|
|
// -------------------------------------------------------------------------
|
|
|
|
#ifdef CUDA_PRE_THREE
|
|
struct __builtin_align__(16) _double4
|
|
{
|
|
double x, y, z, w;
|
|
};
|
|
typedef struct _double4 double4;
|
|
#endif
|
|
|
|
#ifdef _DOUBLE_DOUBLE
|
|
|
|
#define ucl_exp exp
|
|
#define ucl_powr pow
|
|
#define ucl_atan atan
|
|
#define ucl_cbrt cbrt
|
|
#define ucl_ceil ceil
|
|
#define ucl_abs fabs
|
|
#define ucl_rsqrt rsqrt
|
|
#define ucl_sqrt sqrt
|
|
#define ucl_recip(x) ((numtyp)1.0/(x))
|
|
|
|
#else
|
|
|
|
#define ucl_atan atanf
|
|
#define ucl_cbrt cbrtf
|
|
#define ucl_ceil ceilf
|
|
#define ucl_abs fabsf
|
|
#define ucl_recip(x) ((numtyp)1.0/(x))
|
|
#define ucl_rsqrt rsqrtf
|
|
#define ucl_sqrt sqrtf
|
|
#define ucl_exp expf
|
|
#define ucl_powr powf
|
|
|
|
#endif
|
|
|
|
// -------------------------------------------------------------------------
|
|
// KERNEL MACROS - SHUFFLE
|
|
// -------------------------------------------------------------------------
|
|
|
|
#if SHUFFLE_AVAIL == 1
|
|
|
|
#ifndef USE_HIP
|
|
#if (__CUDACC_VER_MAJOR__ < 9)
|
|
#define CUDA_PRE_NINE
|
|
#endif
|
|
#endif
|
|
|
|
#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__)
|
|
|
|
#ifdef _SINGLE_SINGLE
|
|
#define shfl_down __shfl_down
|
|
#define shfl_xor __shfl_xor
|
|
#else
|
|
ucl_inline double shfl_down(double var, unsigned int delta, int width) {
|
|
int2 tmp;
|
|
tmp.x = __double2hiint(var);
|
|
tmp.y = __double2loint(var);
|
|
tmp.x = __shfl_down(tmp.x,delta,width);
|
|
tmp.y = __shfl_down(tmp.y,delta,width);
|
|
return __hiloint2double(tmp.x,tmp.y);
|
|
}
|
|
ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
|
|
int2 tmp;
|
|
tmp.x = __double2hiint(var);
|
|
tmp.y = __double2loint(var);
|
|
tmp.x = __shfl_xor(tmp.x,lanemask,width);
|
|
tmp.y = __shfl_xor(tmp.y,lanemask,width);
|
|
return __hiloint2double(tmp.x,tmp.y);
|
|
}
|
|
#endif
|
|
#define simd_broadcast_i __shfl
|
|
#define simd_broadcast_f __shfl
|
|
#ifdef _DOUBLE_DOUBLE
|
|
ucl_inline double simd_broadcast_d(double var, unsigned int src,
|
|
int width) {
|
|
int2 tmp;
|
|
tmp.x = __double2hiint(var);
|
|
tmp.y = __double2loint(var);
|
|
tmp.x = __shfl(tmp.x,src,width);
|
|
tmp.y = __shfl(tmp.y,src,width);
|
|
return __hiloint2double(tmp.x,tmp.y);
|
|
}
|
|
#endif
|
|
|
|
#else
|
|
|
|
#ifdef _SINGLE_SINGLE
|
|
ucl_inline float shfl_down(float var, unsigned int delta, int width) {
|
|
return __shfl_down_sync(0xffffffff, var, delta, width);
|
|
}
|
|
ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) {
|
|
return __shfl_xor_sync(0xffffffff, var, lanemask, width);
|
|
}
|
|
#else
|
|
ucl_inline double shfl_down(double var, unsigned int delta, int width) {
|
|
int2 tmp;
|
|
tmp.x = __double2hiint(var);
|
|
tmp.y = __double2loint(var);
|
|
tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width);
|
|
tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width);
|
|
return __hiloint2double(tmp.x,tmp.y);
|
|
}
|
|
ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
|
|
int2 tmp;
|
|
tmp.x = __double2hiint(var);
|
|
tmp.y = __double2loint(var);
|
|
tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width);
|
|
tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width);
|
|
return __hiloint2double(tmp.x,tmp.y);
|
|
}
|
|
#endif
|
|
#define simd_broadcast_i(var, src, width) \
|
|
__shfl_sync(0xffffffff, var, src, width)
|
|
#define simd_broadcast_f(var, src, width) \
|
|
__shfl_sync(0xffffffff, var, src, width)
|
|
#ifdef _DOUBLE_DOUBLE
|
|
ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) {
|
|
int2 tmp;
|
|
tmp.x = __double2hiint(var);
|
|
tmp.y = __double2loint(var);
|
|
tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width);
|
|
tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width);
|
|
return __hiloint2double(tmp.x,tmp.y);
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
#endif
|
|
|
|
// -------------------------------------------------------------------------
|
|
// END CUDA / HIP DEFINITIONS
|
|
// -------------------------------------------------------------------------
|
|
|
|
#endif
|