lammps/lib/gpu/lal_pre_cuda_hip.h

// **************************************************************************
// pre_cuda_hip.h
// -------------------
// W. Michael Brown (ORNL)
// Nitin Dhamankar (Intel)
//
// Device-side preprocessor definitions for CUDA and HIP builds
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/
//*************************************************************************
// Device Configuration Definitions
// See lal_preprocessor.h for definitions
//*************************************************************************/
// -------------------------------------------------------------------------
// CUDA and HIP DEFINITIONS
// -------------------------------------------------------------------------
#if defined(NV_KERNEL) || defined(USE_HIP)
// -------------------------------------------------------------------------
// DEVICE CONFIGURATION
// -------------------------------------------------------------------------
#ifdef __HIP_PLATFORM_HCC__
#define CONFIG_ID 303
#define SIMD_SIZE 64
#else
#define CONFIG_ID 103
#define SIMD_SIZE 32
#endif
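// SIMD_SIZE matches the hardware execution width: 32 threads per warp on
// NVIDIA devices, 64 work-items per wavefront on AMD GCN-class devices.
// CONFIG_ID reports to the host library which parameter set is in use.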
#define MEM_THREADS SIMD_SIZE
#define SHUFFLE_AVAIL 1
#define FAST_MATH 1
#define THREADS_PER_ATOM 4
#define THREADS_PER_CHARGE 8
#define THREADS_PER_THREE 2
#define BLOCK_PAIR 256
#define BLOCK_BIO_PAIR 256
#define BLOCK_ELLIPSE 128
#define PPPM_BLOCK_1D 64
#define BLOCK_NBOR_BUILD 128
#define BLOCK_CELL_2D 8
#define BLOCK_CELL_ID 128
#define MAX_SHARED_TYPES 11
#define MAX_BIO_SHARED_TYPES 128
#define PPPM_MAX_SPLINE 8
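// THREADS_PER_ATOM, THREADS_PER_CHARGE, and THREADS_PER_THREE set how many
// threads cooperate on one atom's neighbor list in the pair, charge, and
// three-body kernels; the BLOCK_* values are the default thread-block sizes
// per kernel family.  As an illustrative sketch (not code from the kernels;
// numj stands in for the atom's neighbor count), the cooperative split is:
//
//   int i = (GLOBAL_ID_X)/THREADS_PER_ATOM;           // atom for this team
//   int offset = THREAD_ID_X & (THREADS_PER_ATOM-1);  // lane within team
//   for (int nbor = offset; nbor < numj; nbor += THREADS_PER_ATOM)
//     ;  // each lane handles every THREADS_PER_ATOM-th neighbor,
//        // then the partial forces are reduced across the team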
// -------------------------------------------------------------------------
// LEGACY DEVICE CONFIGURATION
// -------------------------------------------------------------------------
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ < 200)
#undef CONFIG_ID
#define CONFIG_ID 101
#define MEM_THREADS 16
#undef THREADS_PER_ATOM
#define THREADS_PER_ATOM 1
#undef THREADS_PER_CHARGE
#define THREADS_PER_CHARGE 16
#undef BLOCK_PAIR
#define BLOCK_PAIR 64
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 64
#undef BLOCK_NBOR_BUILD
#define BLOCK_NBOR_BUILD 64
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0
#elif (__CUDA_ARCH__ < 300)
#undef CONFIG_ID
#define CONFIG_ID 102
#undef BLOCK_PAIR
#define BLOCK_PAIR 128
#undef BLOCK_BIO_PAIR
#define BLOCK_BIO_PAIR 128
#undef MAX_SHARED_TYPES
#define MAX_SHARED_TYPES 8
#undef SHUFFLE_AVAIL
#define SHUFFLE_AVAIL 0
#endif
#endif
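// Warp shuffle instructions first appeared with Kepler (sm_30), so these
// older architectures set SHUFFLE_AVAIL 0 and fall back to shared-memory
// reductions, with smaller default block sizes to match their limits.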
// -------------------------------------------------------------------------
// KERNEL MACROS
// -------------------------------------------------------------------------
#ifdef USE_HIP
#include <hip/hip_runtime.h>
#endif
#define fast_mul(X,Y) ((X)*(Y))
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ < 200)
#undef fast_mul
#define fast_mul __mul24
#endif
#endif
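// On compute capability 1.x devices, the 24-bit multiply __mul24 was
// cheaper than a full 32-bit multiply, so index arithmetic funnels
// through fast_mul.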
#define EVFLAG 1
#define NOUNROLL
#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x)
#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y)
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define NUM_BLOCKS_X gridDim.x
#define __kernel extern "C" __global__
#ifdef __local
#undef __local
#endif
#define __local __shared__
#define __global
#define restrict __restrict__
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#define simd_size() SIMD_SIZE
#define simdsync()
#ifdef NV_KERNEL
#if (__CUDACC_VER_MAJOR__ >= 9)
#undef simdsync
#define simdsync() __syncwarp(0xffffffff)
#endif
#endif
#ifdef __HIP_PLATFORM_NVCC__
#undef simdsync
#define simdsync() __syncwarp(0xffffffff)
#endif
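// The macros above let a single kernel source build as either OpenCL or
// CUDA/HIP.  A minimal illustrative kernel (not part of the library)
// written against them:
//
//   __kernel void k_scale(__global float *restrict x, const int n) {
//     int i = GLOBAL_ID_X;   // expands to thread/block index arithmetic
//     if (i < n) x[i] *= 0.5f;
//   }
//
// Under CUDA/HIP this becomes an extern "C" __global__ function taking a
// __restrict__ pointer; the same source compiles unchanged under OpenCL.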
// -------------------------------------------------------------------------
// KERNEL MACROS - TEXTURES
// -------------------------------------------------------------------------
#ifdef __HIP_PLATFORM_HCC__
#define _texture(name, type) __device__ type* name
#define _texture_2d(name, type) __device__ type* name
#else
#define _texture(name, type) texture<type> name
#define _texture_2d(name, type) texture<type,1> name
#endif
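// On the HCC (AMD) platform there is no texture path: _texture declares a
// plain device pointer, and the fetch macros below are overridden to read
// straight from global memory.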
#if (__CUDACC_VER_MAJOR__ < 11)
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) { \
  int4 xy = tex1Dfetch(pos_tex,i*2); \
  int4 zt = tex1Dfetch(pos_tex,i*2+1); \
  ans.x=__hiloint2double(xy.y, xy.x); \
  ans.y=__hiloint2double(xy.w, xy.z); \
  ans.z=__hiloint2double(zt.y, zt.x); \
  ans.w=__hiloint2double(zt.w, zt.z); \
}
#define fetch(ans,i,q_tex) { \
  int2 qt = tex1Dfetch(q_tex,i); \
  ans=__hiloint2double(qt.y, qt.x); \
}
#else
#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
#endif
#else
#define fetch4(ans,i,x) ans=x[i]
#define fetch(ans,i,q) ans=q[i]
#undef _texture
#undef _texture_2d
#define _texture(name, type)
#define _texture_2d(name, type)
#define pos_tex x_
#define quat_tex qif
#define q_tex q_
#define vel_tex v_
#define mu_tex mu_
#endif
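// Texture references are deprecated beginning with CUDA 11, so newer
// toolkits skip the texture path entirely: fetch/fetch4 become direct
// array reads and the *_tex names alias the kernels' global-memory
// arguments (x_, q_, v_, ...).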
#ifdef __HIP_PLATFORM_HCC__
#undef fetch4
#undef fetch
#ifdef _DOUBLE_DOUBLE
#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i))
#else
#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
#define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i))
#endif
#endif
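// Illustrative use (not library code): whichever branch above is active,
// a kernel loads positions and charges the same way,
//
//   numtyp4 ix; fetch4(ix,i,pos_tex);   // position + type of atom i
//   numtyp qtmp; fetch(qtmp,i,q_tex);   // charge of atom i
//
// and the macro resolves to a texture fetch or a plain global load.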
// -------------------------------------------------------------------------
// KERNEL MACROS - MATH
// -------------------------------------------------------------------------
#ifdef CUDA_PRE_THREE
struct __builtin_align__(16) _double4 {
  double x, y, z, w;
};
typedef struct _double4 double4;
#endif
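// For toolkits predating CUDA 3.0 (CUDA_PRE_THREE), a 16-byte-aligned
// double4 is defined above so double-precision vector loads stay aligned.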
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs
#define ucl_rsqrt rsqrt
#define ucl_sqrt sqrt
#define ucl_recip(x) ((numtyp)1.0/(x))
#else
#define ucl_atan atanf
#define ucl_cbrt cbrtf
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#define ucl_exp expf
#define ucl_powr powf
#endif
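// The ucl_* wrappers keep the kernels generic over precision: compiled
// with numtyp as float they resolve to the single-precision intrinsics,
// with numtyp as double to the standard double-precision functions.  An
// illustrative fragment (not library code; lj1 and lj2 stand in for
// precomputed coefficients):
//
//   numtyp r2inv = ucl_recip(rsq);        // 1/r^2 in either precision
//   numtyp r6inv = r2inv*r2inv*r2inv;
//   numtyp force = r2inv*r6inv*(lj1*r6inv - lj2);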
// -------------------------------------------------------------------------
// KERNEL MACROS - SHUFFLE
// -------------------------------------------------------------------------
#if SHUFFLE_AVAIL == 1
#ifndef USE_HIP
#if (__CUDACC_VER_MAJOR__ < 9)
#define CUDA_PRE_NINE
#endif
#endif
#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__)
#ifdef _SINGLE_SINGLE
#define shfl_down __shfl_down
#define shfl_xor __shfl_xor
#else
ucl_inline double shfl_down(double var, unsigned int delta, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_down(tmp.x,delta,width);
  tmp.y = __shfl_down(tmp.y,delta,width);
  return __hiloint2double(tmp.x,tmp.y);
}
ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_xor(tmp.x,lanemask,width);
  tmp.y = __shfl_xor(tmp.y,lanemask,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#define simd_broadcast_i __shfl
#define simd_broadcast_f __shfl
#ifdef _DOUBLE_DOUBLE
ucl_inline double simd_broadcast_d(double var, unsigned int src,
                                   int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl(tmp.x,src,width);
  tmp.y = __shfl(tmp.y,src,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#else
#ifdef _SINGLE_SINGLE
ucl_inline float shfl_down(float var, unsigned int delta, int width) {
  return __shfl_down_sync(0xffffffff, var, delta, width);
}
ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) {
  return __shfl_xor_sync(0xffffffff, var, lanemask, width);
}
#else
ucl_inline double shfl_down(double var, unsigned int delta, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width);
  tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width);
  return __hiloint2double(tmp.x,tmp.y);
}
ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width);
  tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#define simd_broadcast_i(var, src, width) \
  __shfl_sync(0xffffffff, var, src, width)
#define simd_broadcast_f(var, src, width) \
  __shfl_sync(0xffffffff, var, src, width)
#ifdef _DOUBLE_DOUBLE
ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) {
  int2 tmp;
  tmp.x = __double2hiint(var);
  tmp.y = __double2loint(var);
  tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width);
  tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width);
  return __hiloint2double(tmp.x,tmp.y);
}
#endif
#endif
#endif
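// Illustrative use (not library code): the pair kernels combine the
// partial forces held by each lane of a team with a shuffle-based tree
// reduction,
//
//   for (unsigned int s = t_per_atom/2; s > 0; s >>= 1)
//     f.x += shfl_down(f.x, s, t_per_atom);
//   simdsync();
//
// where t_per_atom is the team width.  The *_sync variants used for
// CUDA 9+ carry an explicit full-warp mask because warp execution is no
// longer implicitly lockstep from Volta onward.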
// -------------------------------------------------------------------------
// END CUDA / HIP DEFINITIONS
// -------------------------------------------------------------------------
#endif