git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6625 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-08-08 19:51:22 +00:00 · 2011-08-08 19:51:22 +00:00 · 3c1cefdba0
parent 7bf99d0e58
commit 3c1cefdba0
9 changed files with 37 additions and 42 deletions
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -83,7 +83,11 @@ NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1.
      when attempting to run PPPM on a GPU with compute capability 1.0.

 NOTE: Double precision is only supported on certain GPUs (with
-      compute capability>=1.3).
+      compute capability>=1.3). If you compile the GPU library for
+      a GPU with compute capability 1.1 or 1.2, then only single
+      precision FFTs are supported, i.e. LAMMPS has to be compiled
+      with -DFFT_SINGLE. For details on configuring FFT support in 
+      LAMMPS, see http://lammps.sandia.gov/doc/Section_start.html#2_2_4
      
 NOTE: For Tesla and other graphics cards with compute capability>=1.3,
      make sure that -arch=sm_13 is set on the CUDA_ARCH line.
@ -96,9 +100,8 @@ NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
 NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
      installed if the USER-CG-CMM package has been installed.

-NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and
-      pppm/gpu/double styles will only be installed if the KSPACE package has
-      been installed.
+NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, and pppm/gpu styles
+      will only be installed if the KSPACE package has been installed.

 NOTE: The lj/charmm/coul/long will only be installed if the MOLECULE package
      has been installed.
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@ -200,10 +200,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -212,7 +210,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].y) {
            energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
                      lj3[mtype].w;
@ -394,10 +393,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -406,7 +403,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].y) {
            energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
                      lj3[mtype].w;
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@ -204,10 +204,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -216,7 +214,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < cut_ljsq) {
            numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
            if (rsq > cut_lj_innersq)
@ -407,10 +406,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -419,7 +416,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < cut_ljsq) {
            numtyp e=lj3-lj4;
            if (rsq > cut_lj_innersq)
--- a/lib/gpu/lj_class2_long.cu
+++ b/lib/gpu/lj_class2_long.cu
@ -191,10 +191,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -203,7 +201,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].w) {
            numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
@ -379,10 +378,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -391,7 +388,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].w) {
            numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@ -191,10 +191,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -203,7 +201,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].w) {
            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
@ -377,10 +376,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
          prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
          forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }

        force = (force_lj + forcecoul) * r2inv;

@ -389,7 +386,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
        f.z+=delz*force;

        if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
          if (rsq < lj1[mtype].w) {
            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
            energy+=factor_lj*(e-lj3[mtype].z);
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@ -144,7 +144,7 @@ __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
      
        int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
        int old=atom_add(counts+i, 1);
-        if (old==max_atoms) {
+        if (old>=max_atoms) {
          *error=2;
          atom_add(counts+i, -1);
        } else
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@ -51,7 +51,7 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
                              const int order, const int nxlo_out,
                              const int nylo_out, const int nzlo_out,
                              const int nxhi_out, const int nyhi_out,
-                              const int nzhi_out, double **rho_coeff,
+                              const int nzhi_out, grdtyp **rho_coeff,
                              grdtyp **vd_brick, const double slab_volfactor, 
                              const int nx_pppm, const int ny_pppm,
                              const int nz_pppm, int &flag) {
@ -123,7 +123,7 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
  int numel=order*( order/2 - n2lo + 1 );
  success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
                      UCL_SUCCESS);
-  UCL_H_Vec<double> view;
+  UCL_H_Vec<grdtyp> view;
  view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
  ucl_copy(d_rho_coeff,view,true);
  _max_bytes+=d_rho_coeff.row_bytes();
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@ -46,7 +46,7 @@ class PPPMGPUMemory {
  grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
                const int nxlo_out, const int nylo_out, const int nzlo_out,
                const int nxhi_out, const int nyhi_out, const int nzhi_out,
-                double **rho_coeff, grdtyp **vd_brick, 
+                grdtyp **rho_coeff, grdtyp **vd_brick, 
                const double slab_volfactor, const int nx_pppm, 
                const int ny_pppm, const int nz_pppm, int &success);

--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@ -34,7 +34,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
                       FILE *screen, const int order, const int nxlo_out, 
                       const int nylo_out, const int nzlo_out,
                       const int nxhi_out, const int nyhi_out,
-                       const int nzhi_out, double **rho_coeff,
+                       const int nzhi_out, grdtyp **rho_coeff,
                       grdtyp **vd_brick, const double slab_volfactor,
                       const int nx_pppm, const int ny_pppm, const int nz_pppm,
                       int &success) {
@ -95,7 +95,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
                        const int order, const int nxlo_out, 
                        const int nylo_out, const int nzlo_out,
                        const int nxhi_out, const int nyhi_out,
-                        const int nzhi_out, double **rho_coeff,
+                        const int nzhi_out, float **rho_coeff,
                        float **vd_brick, const double slab_volfactor,
                        const int nx_pppm, const int ny_pppm, const int nz_pppm,
                        int &success) {