From 14a348470c1975aa22f1a6cd646721222429923c Mon Sep 17 00:00:00 2001 From: sjplimp Date: Wed, 18 May 2011 20:03:24 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6136 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.linux | 4 ++ lib/gpu/README | 52 ++++++++++++++----------- lib/gpu/cmm_cut_gpu_kernel.cu | 48 +++++++++++------------ lib/gpu/cmmc_long_gpu_kernel.cu | 64 +++++++++++++++---------------- lib/gpu/crml_gpu_kernel.cu | 68 ++++++++++++++++----------------- lib/gpu/gb_gpu_kernel_nbor.cu | 20 +++++----- lib/gpu/geryon/ucl_nv_kernel.h | 8 ++++ lib/gpu/lj96_cut_gpu_kernel.cu | 48 +++++++++++------------ lib/gpu/lj_cut_gpu_kernel.cu | 48 +++++++++++------------ lib/gpu/lj_expand_gpu_kernel.cu | 48 +++++++++++------------ lib/gpu/ljc_cut_gpu_kernel.cu | 48 +++++++++++------------ lib/gpu/ljcl_cut_gpu_kernel.cu | 64 +++++++++++++++---------------- lib/gpu/morse_gpu_kernel.cu | 48 +++++++++++------------ lib/gpu/pair_gpu_atom_kernel.cu | 14 +++---- lib/gpu/pair_gpu_device.cpp | 3 +- lib/gpu/pair_gpu_device.h | 3 ++ lib/gpu/pppm_gpu_kernel.cu | 48 +++++++++++++---------- lib/gpu/pppm_gpu_memory.cpp | 6 ++- 18 files changed, 337 insertions(+), 305 deletions(-) diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux index d69a00a817..1777187010 100644 --- a/lib/gpu/Makefile.linux +++ b/lib/gpu/Makefile.linux @@ -20,7 +20,11 @@ CUDA_HOME = /usr/local/cuda NVCC = nvcc +# newer CUDA CUDA_ARCH = -arch=sm_13 +# older CUDA +#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE + CUDA_PRECISION = -D_SINGLE_SINGLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 diff --git a/lib/gpu/README b/lib/gpu/README index a60d43064a..73a51fc391 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -33,13 +33,17 @@ NOTE: Installation of the CUDA SDK is not required. Current pair styles supporting GPU acceleration: - 1. lj/cut/gpu - 2. lj/cut/coul/cut/gpu - 3. lj/cut/coul/long/gpu - 4. lj96/cut/gpu - 5. gayberne/gpu - 6. cmm/cg/gpu - 7. cmm/cg/coul/long/gpu + 1. lj/cut + 2. lj96/cut + 3. lj/expand + 4. lj/cut/coul/cut + 5. lj/cut/coul/long + 6. lj/charmm/coul/long + 7. morse + 8. cg/cmm + 9. cg/cmm/coul/long + 10. gayberne + 11. pppm MULTIPLE LAMMPS PROCESSES @@ -52,12 +56,12 @@ LAMMPS user manual for details on running with GPU acceleration. BUILDING AND PRECISION MODES -To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUD, -CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should -be set based on the compute capability of your GPU. This can be verified by -running the nvc_get_devices executable after the build is complete. -Additionally, the GPU package must be installed and compiled for LAMMPS. -This may require editing the gpu_SYSPATH variable in the LAMMPS makefile. +To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of +the Makefiles. CUDA_ARCH should be set based on the compute capability of +your GPU. This can be verified by running the nvc_get_devices executable after +the build is complete. Additionally, the GPU package must be installed and +compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the +LAMMPS makefile. Please note that the GPU library accesses the CUDA driver library directly, so it needs to be linked not only to the CUDA runtime library (libcudart.so) @@ -74,6 +78,10 @@ the CUDA_PRECISION variable: CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double +NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1. + You will get the error "GPU library not compiled for this accelerator." + when attempting to run PPPM on a GPU with compute capability 1.0. + NOTE: Double precision is only supported on certain GPUs (with compute capability>=1.3). @@ -83,15 +91,17 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3, NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line. NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE - package has been installed before installing the GPU package in LAMMPS. + package has been installed. NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be - installed if the USER-CG-CMM package has been installed before - installing the GPU package in LAMMPS. + installed if the USER-CG-CMM package has been installed. -NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu style will only be - installed if the KSPACE package has been installed before installing - the GPU package in LAMMPS. +NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and + pppm/gpu/double styles will only be installed if the KSPACE package has + been installed. + +NOTE: The lj/charmm/coul/long will only be installed if the MOLECULE package + has been installed. EXAMPLE BUILD PROCESS @@ -105,7 +115,3 @@ make yes-asphere make yes-kspace make yes-gpu make linux - ------------------------------------------------------------------------- -Last merge with gpulammps: r561 on 2010-11-12 ------------------------------------------------------------------------- diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 08cc31ed7f..f99e7f06ac 100644 --- a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -18,30 +18,6 @@ #ifndef CMM_GPU_KERNEL #define CMM_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 5153cb5016..a47a9267a1 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -18,38 +18,6 @@ #ifndef CMML_GPU_KERNEL #define CMML_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - -#define EWALD_F (numtyp)1.12837917 -#define EWALD_P (numtyp)0.3275911 -#define A1 (numtyp)0.254829592 -#define A2 (numtyp)-0.284496736 -#define A3 (numtyp)1.421413741 -#define A4 (numtyp)-1.453152027 -#define A5 (numtyp)1.061405429 - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index 63ce924581..dfdc7af3cd 100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -18,40 +18,6 @@ #ifndef CRML_GPU_KERNEL #define CRML_GPU_KERNEL -#define MAX_BIO_SHARED_TYPES 128 - -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - -#define EWALD_F (numtyp)1.12837917 -#define EWALD_P (numtyp)0.3275911 -#define A1 (numtyp)0.254829592 -#define A2 (numtyp)-0.284496736 -#define A3 (numtyp)1.421413741 -#define A4 (numtyp)-1.453152027 -#define A5 (numtyp)1.061405429 - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -94,6 +60,40 @@ __inline float fetch_q(const int& i, const float *q) #endif +#define MAX_BIO_SHARED_TYPES 128 + +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/gb_gpu_kernel_nbor.cu b/lib/gpu/gb_gpu_kernel_nbor.cu index 1b1d81fa42..b35b5b6998 100644 --- a/lib/gpu/gb_gpu_kernel_nbor.cu +++ b/lib/gpu/gb_gpu_kernel_nbor.cu @@ -18,16 +18,6 @@ #ifndef PAIR_GPU_KERNEL_H #define PAIR_GPU_KERNEL_H -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#else -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -44,6 +34,16 @@ #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#else +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#endif + // --------------------------------------------------------------------------- // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access // -- Only unpack neighbors matching the specified inclusive range of forms diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index 5c45dc3a87..65a51b5f04 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -33,6 +33,14 @@ #define MEM_THREADS 32 #endif +#ifdef CUDA_PRE_THREE +struct __builtin_align__(16) _double4 +{ + double x, y, z, w; +}; +typedef struct _double4 double4; +#endif + #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index 3fc6a2f308..1de9a8a7bf 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -18,30 +18,6 @@ #ifndef LJ96_GPU_KERNEL #define LJ96_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 75f36446f7..9ef698cd09 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -18,30 +18,6 @@ #ifndef LJ_GPU_KERNEL #define LJ_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu index 2d09b4d941..26fbefacf8 100644 --- a/lib/gpu/lj_expand_gpu_kernel.cu +++ b/lib/gpu/lj_expand_gpu_kernel.cu @@ -18,30 +18,6 @@ #ifndef LJE_GPU_KERNEL #define LJE_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index 44a607588a..ad1e530712 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -18,30 +18,6 @@ #ifndef LJC_GPU_KERNEL #define LJC_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -85,6 +61,30 @@ __inline float fetch_q(const int& i, const float *q) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index 7be7a86114..ddde1dec32 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -18,38 +18,6 @@ #ifndef LJCL_GPU_KERNEL #define LJCL_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - -#define EWALD_F (numtyp)1.12837917 -#define EWALD_P (numtyp)0.3275911 -#define A1 (numtyp)0.254829592 -#define A2 (numtyp)-0.284496736 -#define A3 (numtyp)1.421413741 -#define A4 (numtyp)-1.453152027 -#define A5 (numtyp)1.061405429 - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + +#define EWALD_F (numtyp)1.12837917 +#define EWALD_P (numtyp)0.3275911 +#define A1 (numtyp)0.254829592 +#define A2 (numtyp)-0.284496736 +#define A3 (numtyp)1.421413741 +#define A4 (numtyp)-1.453152027 +#define A5 (numtyp)1.061405429 + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu index 0a89aae070..8832f58c64 100644 --- a/lib/gpu/morse_gpu_kernel.cu +++ b/lib/gpu/morse_gpu_kernel.cu @@ -18,30 +18,6 @@ #ifndef MORSE_GPU_KERNEL #define MORSE_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp2 double2 -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp2 float2 -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "nv_kernel_def.h" @@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp2 double2 +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp2 float2 +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF __inline int sbmask(int j) { return j >> SBBITS & 3; } diff --git a/lib/gpu/pair_gpu_atom_kernel.cu b/lib/gpu/pair_gpu_atom_kernel.cu index 2d1a6ba85f..ab79ac6e9c 100644 --- a/lib/gpu/pair_gpu_atom_kernel.cu +++ b/lib/gpu/pair_gpu_atom_kernel.cu @@ -15,6 +15,13 @@ Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ +#ifdef NV_KERNEL +#include "geryon/ucl_nv_kernel.h" +#else +#pragma OPENCL EXTENSION cl_khr_fp64: enable +#define GLOBAL_ID_X get_global_id(0) +#endif + #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp4 double4 @@ -23,13 +30,6 @@ #define numtyp4 float4 #endif -#ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" -#else -#pragma OPENCL EXTENSION cl_khr_fp64: enable -#define GLOBAL_ID_X get_global_id(0) -#endif - __kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x, __global int *type, const int nall) { int ii=GLOBAL_ID_X; diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index d5906b10e5..165d202832 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -549,8 +549,9 @@ int PairGPUDeviceT::compile_kernels() { k_info.run(&d_gpu_lib_data.begin()); ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); + _ptx_arch=static_cast(h_gpu_lib_data[0])/100.0; #ifndef USE_OPENCL - if (static_cast(h_gpu_lib_data[0])/100.0>gpu->arch()) + if (_ptx_arch>gpu->arch()) return -4; #endif diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index 1e7e15e6a8..52b35cfcf2 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -226,6 +226,8 @@ class PairGPUDevice { inline int block_bio_pair() const { return _block_bio_pair; } /// Return the maximum number of atom types for shared mem with "bio" styles inline int max_bio_shared_types() const { return _max_bio_shared_types; } + /// Architecture gpu code compiled for (returns 0 for OpenCL) + inline double ptx_arch() const { return _ptx_arch; } // -------------------- SHARED DEVICE ROUTINES -------------------- // Perform asynchronous zero of integer array @@ -281,6 +283,7 @@ class PairGPUDevice { int _gpu_mode, _first_device, _last_device, _nthreads; double _particle_split; double _cpu_full; + double _ptx_arch; int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; int _pppm_max_spline, _pppm_block; diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index c04e784de8..fe1862d051 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -18,27 +18,6 @@ #ifndef PPPM_GPU_KERNEL #define PPPM_GPU_KERNEL -#ifdef _DOUBLE_DOUBLE -#define numtyp double -#define numtyp4 double4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifdef _SINGLE_DOUBLE -#define numtyp float -#define numtyp4 float4 -#define acctyp double -#define acctyp4 double4 -#endif - -#ifndef numtyp -#define numtyp float -#define numtyp4 float4 -#define acctyp float -#define acctyp4 float4 -#endif - #ifdef NV_KERNEL #include "geryon/ucl_nv_kernel.h" @@ -67,6 +46,12 @@ __inline float fetch_q(const int& i, const float *q) #endif +// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error +// generated at runtime with use of pppm/gpu +#if (__CUDA_ARCH__ < 110) +#define atom_add(x,y) 0 +#endif + #else #pragma OPENCL EXTENSION cl_khr_fp64: enable @@ -85,6 +70,27 @@ __inline float fetch_q(const int& i, const float *q) #endif +#ifdef _DOUBLE_DOUBLE +#define numtyp double +#define numtyp4 double4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifdef _SINGLE_DOUBLE +#define numtyp float +#define numtyp4 float4 +#define acctyp double +#define acctyp4 double4 +#endif + +#ifndef numtyp +#define numtyp float +#define numtyp4 float4 +#define acctyp float +#define acctyp4 float4 +#endif + // Maximum order for spline #define PPPM_MAX_SPLINE 8 // Thread block size for PPPM kernels diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 521b3b1e46..2f7b35d051 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -66,7 +66,11 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, flag=-5; return 0; } - + if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) { + flag=-4; + return 0; + } + ucl_device=device->gpu; atom=&device->atom;