From 14a348470c1975aa22f1a6cd646721222429923c Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Wed, 18 May 2011 20:03:24 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6136
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/Makefile.linux          |  4 ++
 lib/gpu/README                  | 52 ++++++++++++++-----------
 lib/gpu/cmm_cut_gpu_kernel.cu   | 48 +++++++++++------------
 lib/gpu/cmmc_long_gpu_kernel.cu | 64 +++++++++++++++----------------
 lib/gpu/crml_gpu_kernel.cu      | 68 ++++++++++++++++-----------------
 lib/gpu/gb_gpu_kernel_nbor.cu   | 20 +++++-----
 lib/gpu/geryon/ucl_nv_kernel.h  |  8 ++++
 lib/gpu/lj96_cut_gpu_kernel.cu  | 48 +++++++++++------------
 lib/gpu/lj_cut_gpu_kernel.cu    | 48 +++++++++++------------
 lib/gpu/lj_expand_gpu_kernel.cu | 48 +++++++++++------------
 lib/gpu/ljc_cut_gpu_kernel.cu   | 48 +++++++++++------------
 lib/gpu/ljcl_cut_gpu_kernel.cu  | 64 +++++++++++++++----------------
 lib/gpu/morse_gpu_kernel.cu     | 48 +++++++++++------------
 lib/gpu/pair_gpu_atom_kernel.cu | 14 +++----
 lib/gpu/pair_gpu_device.cpp     |  3 +-
 lib/gpu/pair_gpu_device.h       |  3 ++
 lib/gpu/pppm_gpu_kernel.cu      | 48 +++++++++++++----------
 lib/gpu/pppm_gpu_memory.cpp     |  6 ++-
 18 files changed, 337 insertions(+), 305 deletions(-)

diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index d69a00a817..1777187010 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -20,7 +20,11 @@
 CUDA_HOME = /usr/local/cuda
 NVCC = nvcc
 
+# newer CUDA
 CUDA_ARCH = -arch=sm_13
+# older CUDA
+#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
+
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
diff --git a/lib/gpu/README b/lib/gpu/README
index a60d43064a..73a51fc391 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -33,13 +33,17 @@ NOTE: Installation of the CUDA SDK is not required.
 
 Current pair styles supporting GPU acceleration:
 
-  1. lj/cut/gpu
-  2. lj/cut/coul/cut/gpu
-  3. lj/cut/coul/long/gpu
-  4. lj96/cut/gpu
-  5. gayberne/gpu
-  6. cmm/cg/gpu
-  7. cmm/cg/coul/long/gpu
+  1. lj/cut
+  2. lj96/cut
+  3. lj/expand
+  4. lj/cut/coul/cut
+  5. lj/cut/coul/long
+  6. lj/charmm/coul/long
+  7. morse
+  8. cg/cmm
+  9. cg/cmm/coul/long
+ 10. gayberne
+ 11. pppm
 
                      MULTIPLE LAMMPS PROCESSES
                      
@@ -52,12 +56,12 @@ LAMMPS user manual for details on running with GPU acceleration.
 
                     BUILDING AND PRECISION MODES
 
-To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUD,
-CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should
-be set based on the compute capability of your GPU. This can be verified by
-running the nvc_get_devices executable after the build is complete.
-Additionally, the GPU package must be installed and compiled for LAMMPS.
-This may require editing the gpu_SYSPATH variable in the LAMMPS makefile.
+To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of 
+the Makefiles. CUDA_ARCH should be set based on the compute capability of 
+your GPU. This can be verified by running the nvc_get_devices executable after
+the build is complete. Additionally, the GPU package must be installed and
+compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
+LAMMPS makefile.
 
 Please note that the GPU library accesses the CUDA driver library directly,
 so it needs to be linked not only to the CUDA runtime library (libcudart.so)
@@ -74,6 +78,10 @@ the CUDA_PRECISION variable:
   CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
   CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
 
+NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1.
+      You will get the error "GPU library not compiled for this accelerator."
+      when attempting to run PPPM on a GPU with compute capability 1.0.
+
 NOTE: Double precision is only supported on certain GPUs (with
       compute capability>=1.3).
       
@@ -83,15 +91,17 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
 NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line.
 
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
-      package has been installed before installing the GPU package in LAMMPS.
+      package has been installed.
 
 NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
-      installed if the USER-CG-CMM package has been installed before
-      installing the GPU package in LAMMPS.
+      installed if the USER-CG-CMM package has been installed.
 
-NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu style will only be 
-      installed if the KSPACE package has been installed before installing
-      the GPU package in LAMMPS.
+NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and
+      pppm/gpu/double styles will only be installed if the KSPACE package has
+      been installed.
+
+NOTE: The lj/charmm/coul/long will only be installed if the MOLECULE package
+      has been installed.
 
                       EXAMPLE BUILD PROCESS
                     
@@ -105,7 +115,3 @@ make yes-asphere
 make yes-kspace
 make yes-gpu
 make linux
-
-------------------------------------------------------------------------
-Last merge with gpulammps: r561 on 2010-11-12
-------------------------------------------------------------------------
diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu
index 08cc31ed7f..f99e7f06ac 100644
--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef CMM_GPU_KERNEL
 #define CMM_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu
index 5153cb5016..a47a9267a1 100644
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@@ -18,38 +18,6 @@
 #ifndef CMML_GPU_KERNEL
 #define CMML_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu
index 63ce924581..dfdc7af3cd 100644
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@@ -18,40 +18,6 @@
 #ifndef CRML_GPU_KERNEL
 #define CRML_GPU_KERNEL
 
-#define MAX_BIO_SHARED_TYPES 128
-
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -94,6 +60,40 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#define MAX_BIO_SHARED_TYPES 128
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/gb_gpu_kernel_nbor.cu b/lib/gpu/gb_gpu_kernel_nbor.cu
index 1b1d81fa42..b35b5b6998 100644
--- a/lib/gpu/gb_gpu_kernel_nbor.cu
+++ b/lib/gpu/gb_gpu_kernel_nbor.cu
@@ -18,16 +18,6 @@
 #ifndef PAIR_GPU_KERNEL_H
 #define PAIR_GPU_KERNEL_H
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#else
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -44,6 +34,16 @@
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#else
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#endif
+
 // ---------------------------------------------------------------------------
 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
 // -- Only unpack neighbors matching the specified inclusive range of forms
diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h
index 5c45dc3a87..65a51b5f04 100644
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@@ -33,6 +33,14 @@
 #define MEM_THREADS 32
 #endif
 
+#ifdef CUDA_PRE_THREE
+struct __builtin_align__(16) _double4
+{
+  double x, y, z, w;
+};
+typedef struct _double4 double4;
+#endif
+
 #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
 #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
 #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu
index 3fc6a2f308..1de9a8a7bf 100644
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJ96_GPU_KERNEL
 #define LJ96_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu
index 75f36446f7..9ef698cd09 100644
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJ_GPU_KERNEL
 #define LJ_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu
index 2d09b4d941..26fbefacf8 100644
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJE_GPU_KERNEL
 #define LJE_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu
index 44a607588a..ad1e530712 100644
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJC_GPU_KERNEL
 #define LJC_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -85,6 +61,30 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu
index 7be7a86114..ddde1dec32 100644
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@@ -18,38 +18,6 @@
 #ifndef LJCL_GPU_KERNEL
 #define LJCL_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu
index 0a89aae070..8832f58c64 100644
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef MORSE_GPU_KERNEL
 #define MORSE_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/pair_gpu_atom_kernel.cu b/lib/gpu/pair_gpu_atom_kernel.cu
index 2d1a6ba85f..ab79ac6e9c 100644
--- a/lib/gpu/pair_gpu_atom_kernel.cu
+++ b/lib/gpu/pair_gpu_atom_kernel.cu
@@ -15,6 +15,13 @@
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
+#ifdef NV_KERNEL
+#include "geryon/ucl_nv_kernel.h"
+#else
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#endif
+
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp4 double4
@@ -23,13 +30,6 @@
 #define numtyp4 float4
 #endif
 
-#ifdef NV_KERNEL
-#include "geryon/ucl_nv_kernel.h"
-#else
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
-#define GLOBAL_ID_X get_global_id(0)
-#endif
-
 __kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
                             __global int *type, const int nall) {
   int ii=GLOBAL_ID_X;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index d5906b10e5..165d202832 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -549,8 +549,9 @@ int PairGPUDeviceT::compile_kernels() {
   k_info.run(&d_gpu_lib_data.begin());
   ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
   
+  _ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
   #ifndef USE_OPENCL
-  if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
+  if (_ptx_arch>gpu->arch())
     return -4;
   #endif
 
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 1e7e15e6a8..52b35cfcf2 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -226,6 +226,8 @@ class PairGPUDevice {
   inline int block_bio_pair() const { return _block_bio_pair; }
   /// Return the maximum number of atom types for shared mem with "bio" styles
   inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+  /// Architecture gpu code compiled for (returns 0 for OpenCL)
+  inline double ptx_arch() const { return _ptx_arch; }
 
   // -------------------- SHARED DEVICE ROUTINES -------------------- 
   // Perform asynchronous zero of integer array 
@@ -281,6 +283,7 @@ class PairGPUDevice {
   int _gpu_mode, _first_device, _last_device, _nthreads;
   double _particle_split;
   double _cpu_full;
+  double _ptx_arch;
 
   int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
   int _pppm_max_spline, _pppm_block;
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index c04e784de8..fe1862d051 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -18,27 +18,6 @@
 #ifndef PPPM_GPU_KERNEL
 #define PPPM_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "geryon/ucl_nv_kernel.h"
@@ -67,6 +46,12 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
+// generated at runtime with use of pppm/gpu
+#if (__CUDA_ARCH__ < 110)
+#define atom_add(x,y) 0
+#endif
+
 #else
 
 #pragma OPENCL EXTENSION cl_khr_fp64: enable
@@ -85,6 +70,27 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 // Maximum order for spline
 #define PPPM_MAX_SPLINE 8
 // Thread block size for PPPM kernels
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 521b3b1e46..2f7b35d051 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -66,7 +66,11 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
     flag=-5;
     return 0;
   }
-  
+  if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
+    flag=-4;
+    return 0;
+  }
+
   ucl_device=device->gpu;
   atom=&device->atom;