Adding changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3883 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi 2010-03-04 22:35:29 +00:00
parent 780ba6e377
commit c0e0b65619
20 changed files with 578 additions and 611 deletions

View File

@ -74,13 +74,3 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
GPU MEMORY
Upon initialization of the gayberne/gpu pair style, the library will reserve
memory for 64K atoms per GPU or 70% of each card's GPU memory, whichever value
is limiting. The value of 70% can be changed by editing the
PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
of 64K cannot be increased and is the maximum number of atoms allowed per
GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
can help to increase the maximum number of atoms per GPU for cards with
limited memory.
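For reference, a minimal sketch of the sizing rule described above; it mirrors
the LJ_GPU_MemoryT::get_max_atoms() routine that this commit deletes from
lj_gpu_memory later in this diff, with PERCENT_GPU_MEMORY=0.7 and
MAX_ATOMS=65536:

// Old per-GPU cap: 70% of card memory or 64K atoms, whichever is smaller.
int get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
  int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
                              bytes_per_atom(max_nbors));
  if (matoms>MAX_ATOMS)
    matoms=MAX_ATOMS;
  return matoms;
}

The commit replaces this fixed reservation with allocations sized from
nlocal/nall at init time plus resize_atom()/resize_local() growth on demand.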

View File

@ -36,7 +36,7 @@ static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
@ -56,25 +56,24 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
int *nbor_newj=nbor;
nbor+=nbor_pitch;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
// Compute r12;
numtyp rsq=_x_<numtyp>(j,0)-ix;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=_x_<numtyp>(j,1)-iy;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=_x_<numtyp>(j,2)-iz;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
@ -95,7 +94,7 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
@ -124,25 +123,25 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
int *nbor_newj=nbor;
nbor+=nbor_pitch;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
int mtype=itype+_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=_x_<numtyp>(j,0)-ix;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=_x_<numtyp>(j,1)-iy;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=_x_<numtyp>(j,2)-iz;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
@ -159,13 +158,15 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
if (gbm.shared_types)
if (gbm.shared_types) {
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
else
} else
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
@ -188,9 +189,7 @@ EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
string sname=GBMF[0].gpu.name(id)+", "+
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
max_nbors))+" Atoms";
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
}
@ -203,6 +202,7 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
@ -215,7 +215,8 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
well, cutsq, sigma, epsilon, host_lshape, form,
host_lj1, host_lj2, host_lj3, host_lj4, offset,
special_lj, max_nbors, false, gpu_id);
special_lj, nlocal, nall, max_nbors, false,
gpu_id);
}
// ---------------------------------------------------------------------------
@ -230,34 +231,23 @@ EXTERN void gb_gpu_clear(const int thread) {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
double **host_quat, const int *host_type,
const bool rebuild, cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
atom.add_atom_data(host_x[0],3);
atom.add_atom_data(host_x[0]+1,3);
atom.add_atom_data(host_x[0]+2,3);
atom.add_atom_data(host_quat[0],4);
atom.add_atom_data(host_quat[0]+1,4);
atom.add_atom_data(host_quat[0]+2,4);
atom.add_atom_data(host_quat[0]+3,4);
int csize=7;
// If a rebuild occurred, copy type data
if (rebuild) {
atom.add_atom_data(host_type);
csize++;
}
atom.copy_atom_data(csize,stream);
atom.add_x_data(host_x,host_type);
atom.add_q_data(host_quat[0]);
atom.copy_x_data(stream);
atom.copy_q_data(stream);
atom.time_atom.stop();
}
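A minimal sketch of the new packed layout, assuming the single-precision build
where vec4 aliases CUDA's float4 (the names match add_x_data()/add_q_data() in
pair_gpu_atom.h later in this diff):

// One vec4 per atom for position+type and one for the quaternion:
//   dev_x[i] = { x, y, z, (numtyp)type }  -- kernels read the type as ix.w
//   dev_q[i] = { qw, qi, qj, qk }         -- consumed by gpu_quat_to_mat_trans
// Both are staged through the same host_write buffer: 4*nall values of
// positions+types first, then 4*nall quaternion components.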
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
const int *host_type, const bool rebuild,
const int thread) {
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
GBMF[thread].pair_stream);
}
@ -269,46 +259,111 @@ template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
const int inum, int *ilist, const int *numj,
const int *type, bool &success) {
if (nall>gbm.max_atoms) {
success=false;
return 0;
}
success=true;
gbm.nbor.time_nbor.start();
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
if (nall>gbm.max_atoms)
gbm.resize_atom(nall,success);
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
gbm.resize_local(nlocal,mn,success);
if (!success)
return false;
gbm.atom.nall(nall);
gbm.atom.inum(inum);
if (gbm.multiple_forms) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
int ij_size=gbm.nbor.host_ij.numel();
if (inum*2<ij_size) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
2*inum,gbm.pair_stream);
} else {
int p=0, acc=0;
int offset=0;
int half=ij_size/2;
int hi=0;
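// Same split-buffer staging as PairGPUNbor::reset() later in this diff:
// fill half of host_ij, flush it asynchronously, synchronize, then reuse.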
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
if (hi>0) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
hi,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
hi,gbm.pair_stream);
}
gbm.nbor.ij_total=0;
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
gbm.nbor.dev_nbor.row_size(),
gbm.nbor.dev_nbor.row_size(),2,inum,
gbm.pair_stream);
} else {
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
gbm.last_ellipse=inum;
@ -349,6 +404,14 @@ EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}
template<class numtyp, class acctyp>
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_answer.stop();
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
@ -357,7 +420,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
const bool rebuild) {
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int ans_pitch=6;
if (eflag)
ans_pitch++;
if (vflag)
ans_pitch+=6;
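// ans_pitch counts answer fields per atom in the new interleaved layout:
// fx,fy,fz,torx,tory,torz (6), +1 for energy if eflag, +6 for the virial
// if vflag; kernels now index ans+ii*ans_pitch and advance with ap1++
// rather than striding by ans.row_size().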
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
if (gbm.multiple_forms) {
@ -371,9 +439,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
gbm.time_gayberne.stop();
@ -393,14 +462,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
gbm.last_ellipse)/BX));
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
ELLIPSE_SPHERE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne2.stop();
} else {
@ -419,16 +489,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
if (gbm.last_ellipse<gbm.atom.inum()) {
if (gbm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
gbm.atom.inum(), gbm.atom.nall());
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
}
gbm.time_pair.stop();
@ -437,19 +506,21 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
(gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne.stop();
}
}
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread) {
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
}
// ---------------------------------------------------------------------------
@ -462,9 +533,6 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
double *virial) {
double evdw;
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_atom.add_to_total();
gbm.nbor.time_nbor.add_to_total();
gbm.time_kernel.add_to_total();
@ -475,11 +543,19 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
gbm.time_pair.add_to_total();
}
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
gbm.atom.add_forces(ilist,f);
gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
gbm.atom.time_answer.stop();
if (gbm.last_ellipse>gbm.atom.inum()) {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.atom.inum());
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
} else {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.last_ellipse);
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
}
gbm.atom.time_answer.add_to_total();
return evdw;
}
@ -516,3 +592,4 @@ EXTERN int gb_gpu_num_devices() {
EXTERN double gb_gpu_bytes() {
return GBMF[0].host_memory_usage();
}

View File

@ -298,24 +298,22 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
const int qi,
numtyp mat[9])
{
numtyp qi3=_x_<numtyp>(qi,3);
numtyp qi4=_x_<numtyp>(qi,4);
numtyp qi5=_x_<numtyp>(qi,5);
numtyp qi6=_x_<numtyp>(qi,6);
numtyp w2 = qi3*qi3;
numtyp i2 = qi4*qi4;
numtyp j2 = qi5*qi5;
numtyp k2 = qi6*qi6;
numtyp twoij = (numtyp)2.0*qi4*qi5;
numtyp twoik = (numtyp)2.0*qi4*qi6;
numtyp twojk = (numtyp)2.0*qi5*qi6;
numtyp twoiw = (numtyp)2.0*qi4*qi3;
numtyp twojw = (numtyp)2.0*qi5*qi3;
numtyp twokw = (numtyp)2.0*qi6*qi3;
vec4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;
numtyp j2 = q.z*q.z;
numtyp k2 = q.w*q.w;
numtyp twoij = (numtyp)2.0*q.y*q.z;
numtyp twoik = (numtyp)2.0*q.y*q.w;
numtyp twojk = (numtyp)2.0*q.z*q.w;
numtyp twoiw = (numtyp)2.0*q.y*q.x;
numtyp twojw = (numtyp)2.0*q.z*q.x;
numtyp twokw = (numtyp)2.0*q.w*q.x;
mat[0] = w2+i2-j2-k2;
mat[3] = twoij-twokw;
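// The vec4 packs the quaternion as (q.x,q.y,q.z,q.w) = (w,i,j,k), the same
// order the old code read from rows 3-6 of dev_x; mat[] is filled row-major
// with the transpose of the rotation matrix R(q).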

View File

@ -92,16 +92,19 @@ static __inline__ __device__ void compute_eta_torque(numtyp m[9],
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
#include "gb_gpu_kernel.h"
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const int nbor_pitch,
__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
@ -109,7 +112,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
@ -128,16 +131,14 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
const int *nbor_end=nbor+nbor_pitch*numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
{
numtyp t[9];
gpu_quat_to_mat_trans(i,a1);
gpu_quat_to_mat_trans(q,i,a1);
gpu_shape_times3(itype,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_well_times3(itype,a1,t);
@ -146,7 +147,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
@ -154,20 +155,21 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = _x_<numtyp>(j,0)-ix;
r12[1] = _x_<numtyp>(j,1)-iy;
r12[2] = _x_<numtyp>(j,2)-iz;
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(j,a2);
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
@ -366,35 +368,37 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1=fx;
ap1+=ans_pitch;
ap1++;
*ap1=fy;
ap1+=ans_pitch;
ap1++;
*ap1=fz;
ap1+=ans_pitch;
ap1++;
*ap1=torx;
ap1+=ans_pitch;
ap1++;
*ap1=tory;
ap1+=ans_pitch;
ap1++;
*ap1=torz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const int nbor_pitch,
__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum,
@ -425,12 +429,10 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
const int *nbor_end=nbor+nbor_pitch*numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
numtyp oner=_shape_<numtyp>(itype,0);
numtyp one_well=_well_<numtyp>(itype,0);
@ -445,13 +447,14 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = _x_<numtyp>(j,0)-ix;
r12[1] = _x_<numtyp>(j,1)-iy;
r12[2] = _x_<numtyp>(j,2)-iz;
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
@ -463,7 +466,7 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(j,a2);
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, eta, and teta
@ -611,28 +614,29 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1=fx;
ap1+=ans_pitch;
ap1++;
*ap1=fy;
ap1+=ans_pitch;
ap1++;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
__global__ void kernel_lj(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
size_t ans_pitch, int *err_flag, const bool eflag,
const bool vflag, const int start, const int inum,
const int nall) {
@ -663,10 +667,8 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,7);
vec4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; list<list_end; list++) {
@ -678,12 +680,13 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,7);
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
@ -716,29 +719,30 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1+=fx;
ap1+=ans_pitch;
ap1++;
*ap1+=fy;
ap1+=ans_pitch;
ap1++;
*ap1+=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch,
__global__ void kernel_lj_fast(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij,
acctyp *ans, size_t ans_pitch,int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum, const int nall){
@ -788,10 +792,8 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
numtyp factor_lj;
for ( ; list<list_end; list++) {
@ -803,12 +805,13 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
factor_lj = sp_lj[j/nall];
j %= nall;
}
int mtype=itype+_x_<numtyp>(j,7);
vec4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
@ -837,21 +840,21 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1+=ans_pitch;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=ans_pitch;
ap1++;
}
}
*ap1+=fx;
ap1+=ans_pitch;
ap1++;
*ap1+=fy;
ap1+=ans_pitch;
ap1++;
*ap1+=fz;
} // if ii

View File

@ -41,14 +41,17 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
double *host_lshape, int **h_form, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const bool force_d, const int me) {
_max_nbors=max_nbors;
if (this->allocated)
clear();
bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4,
host_offset, host_special_lj, max_nbors, me);
host_offset, host_special_lj, max_nbors, me,
nlocal, nall);
if (!p)
return false;
@ -96,11 +99,28 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
multiple_forms=true;
// Memory for ilist ordered by particle type
host_olist.safe_alloc_rw(this->max_atoms);
return true;
return host_olist.alloc_rw(this->max_local);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
this->atom.resize(this->max_atoms, success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
bool &success) {
if (nlocal>this->max_local) {
this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
host_olist.clear();
success=success && host_olist.alloc_rw(this->max_local);
}
if (max_nbors>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
this->nbor.resize(this->max_local,_max_nbors,success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!this->allocated)

View File

@ -37,8 +37,11 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const bool force_d, const int me);
const int max_nbors, const int nlocal, const int nall,
const bool force_d, const int me);
void resize_atom(const int nall, bool &success);
void resize_local(const int nlocal, const int max_nbors, bool &success);
void clear();
double host_memory_usage();
@ -61,6 +64,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
bool multiple_forms;
int **host_form;
int last_ellipse;
int _max_nbors;
private:
};

View File

@ -74,7 +74,8 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
ij_size=IJ_SIZE;
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
0,0);
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
@ -101,37 +102,6 @@ EXTERN void lj_gpu_clear() {
}
// ---------------------------------------------------------------------------
// Calculate energies and forces for all ij interactions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
ljm.time_pair.start();
if (ljm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
(ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
vflag, ljm.atom.inum(), ljm.atom.nall());
ljm.time_pair.stop();
}
EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
_lj_gpu<PRECISION,ACC_PRECISION>(LJMF, eflag,vflag,rebuild);
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum,

View File

@ -217,229 +217,4 @@ __global__ void kernel_lj_cell(float3 *force3,
}
/* Neighbor list version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch, acctyp *ans,
size_t ans_pitch, const bool eflag,
const bool vflag, const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=_x_<numtyp>(i,3);
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int jtype=_x_<numtyp>(j,3);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype)) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv =r2inv*r2inv*r2inv;
numtyp force =factor_lj*r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
_lj1_<numtyp>(itype,jtype).y);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
_lj3_<numtyp>(itype,jtype).y);
energy+=factor_lj*(e-_offset_<numtyp>(1,1));
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
const int *dev_ij, const int nbor_pitch,
acctyp *ans, size_t ans_pitch,const bool eflag,
const bool vflag, const int inum,
const int nall) {
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
__shared__ numtyp sp_lj[4];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=special_lj[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
if (eflag) {
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
offset[ii]=_offset_<numtyp>(itype,jtype);
}
}
ii+=INT_MUL(blockIdx.x,blockDim.x);
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
numtyp ix=_x_<numtyp>(i,0);
numtyp iy=_x_<numtyp>(i,1);
numtyp iz=_x_<numtyp>(i,2);
int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j= *list;
if (j < nall)
factor_lj = 1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
int mtype=itype+_x_<numtyp>(j,3);
// Compute r12
numtyp delx = ix-_x_<numtyp>(j,0);
numtyp dely = iy-_x_<numtyp>(j,1);
numtyp delz = iz-_x_<numtyp>(j,2);
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype]) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
energy+=factor_lj*(e-offset[mtype]);
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii;
if (eflag) {
*ap1=energy;
ap1+=ans_pitch;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=ans_pitch;
}
}
*ap1=fx;
ap1+=ans_pitch;
*ap1=fy;
ap1+=ans_pitch;
*ap1=fz;
} // if ii
}
#endif

View File

@ -25,15 +25,6 @@ int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
bytes_per_atom(max_nbors));
if (matoms>MAX_ATOMS)
matoms=MAX_ATOMS;
return matoms;
}
template <class numtyp, class acctyp>
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_cutsq, double **host_sigma,
@ -41,7 +32,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
const int me) {
const int me, const int nlocal, const int nall) {
if (allocated)
clear();
@ -55,9 +46,18 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
time_pair.init();
// Initialize atom and nbor data
max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
atom.init(max_atoms);
nbor.init(ij_size,max_atoms,max_nbors);
max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
if (max_local==0)
max_local=1000;
if (nall<=nlocal)
max_atoms=max_local*2;
else
max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (!atom.init(max_atoms))
return false;
if (!nbor.init(ij_size,max_local,max_nbors))
return false;
// Get a stream for computing pair potentials
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
@ -112,7 +112,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
allocated=true;
return true;
}
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
if (!allocated)

View File

@ -28,7 +28,6 @@
#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
#define BIG_NUMBER 100000000
template <class numtyp, class acctyp>
@ -42,14 +41,13 @@ class LJ_GPU_Memory {
double **host_sigma, double **host_epsilon,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int me);
const int max_nbors, const int me, const int nlocal,
const int nall);
/// Free any memory on host and device
void clear();
/// Returns memory usage on GPU per atom
int bytes_per_atom(const int max_nbors) const;
/// Maximum number of atoms that can be stored on GPU
int get_max_atoms(const size_t gpu_bytes, const int max_nbors);
/// Total host memory used by library
double host_memory_usage() const;
@ -72,7 +70,7 @@ class LJ_GPU_Memory {
NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
NVC_VecT special_lj;
size_t max_atoms;
size_t max_atoms, max_local;
// Timing for pair calculation
NVCTimer time_pair;

View File

@ -8,7 +8,6 @@
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
@ -17,6 +16,12 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include "nvc_device.h"
int main(int argc, char** argv) {

View File

@ -20,6 +20,12 @@
#ifndef NVC_MACROS_H
#define NVC_MACROS_H
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <stdio.h>
#include "math_constants.h"
#define INT_MUL(x,y) (__mul24(x,y))

View File

@ -96,6 +96,17 @@ class NVC_Host {
_end=_array+cols;
}
// Allocate page-locked memory with fast write/slow read on host
inline bool alloc_w(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
if (cudaHostAlloc((void **)&_array,_row_bytes,cudaHostAllocWriteCombined)!=
cudaSuccess)
return false;
_end=_array+cols;
return true;
}
// Allocate page-locked memory with fast read/write on host
inline void safe_alloc_rw(const size_t cols) {
_cols=cols;
@ -104,6 +115,16 @@ class NVC_Host {
_end=_array+cols;
}
// Allocate page-locked memory with fast read/write on host
inline bool alloc_rw(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess)
return false;
_end=_array+cols;
return true;
}
/// Free any memory associated with device
inline void clear()
{ if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
@ -163,6 +184,13 @@ class NVC_Host {
cudaMemcpyHostToDevice,stream));
}
/// Asynchronous copy to device (numel is not bytes)
inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel,
cudaStream_t &stream) {
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,numel*sizeof(numtyp),
cudaMemcpyHostToDevice,stream));
}
/// Asynchronous copy to 2D matrix on device (numel is not bytes)
inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
const size_t rows, const size_t cols,
@ -194,6 +222,16 @@ class NVC_Vec {
_end=_array+cols;
}
// Row vector on device
inline bool alloc(const size_t cols) {
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess)
return false;
_end=_array+cols;
return true;
}
// Row vector on device (allocate and assign texture and bind)
inline void safe_alloc(const size_t cols, textureReference *t)
{ safe_alloc(cols); assign_texture(t); bind(); }
@ -221,11 +259,22 @@ class NVC_Vec {
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice)); }
/// Copy from host (n elements)
inline void copy_from_host(const numtyp *host_p, const size_t n)
{ CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp),
cudaMemcpyHostToDevice)); }
/// Asynchronous copy from host
inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
cudaMemcpyHostToDevice, stream)); }
/// Asynchronous copy from host (n elements)
inline void copy_from_host(const numtyp *host_p, const size_t n,
cudaStream_t &stream)
{ CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp),
cudaMemcpyHostToDevice, stream)); }
/// Copy to host
inline void copy_to_host(numtyp *host_p)
{ CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),

View File

@ -28,7 +28,9 @@ int PairGPUAtomT::bytes_per_atom() const {
}
template <class numtyp, class acctyp>
void PairGPUAtomT::init(const int max_atoms) {
bool PairGPUAtomT::init(const int max_atoms) {
bool success=true;
if (allocated)
clear();
@ -39,28 +41,48 @@ void PairGPUAtomT::init(const int max_atoms) {
time_answer.init();
// Device matrices for atom and force data
dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>());
ans.safe_alloc(ans_fields(),max_atoms);
success=success && dev_x.alloc(max_atoms*sizeof(vec4));
success=success && dev_q.alloc(max_atoms*sizeof(vec4));
success=success && ans.alloc(ans_fields()*max_atoms);
// Get a host read/write buffer
success=success && host_read.alloc_rw(max_atoms*ans_fields());
// Get a host write only buffer
host_write.safe_alloc_w(max_atoms*atom_fields());
// Get a host read/write buffer
host_read.safe_alloc_rw(ans.row_size()*ans_fields());
success=success && host_write.alloc_w(max_atoms*atom_fields());
allocated=true;
return success;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::resize(const int max_atoms, bool &success) {
ans.clear();
dev_x.clear();
dev_q.clear();
host_write.clear();
host_read.clear();
_max_atoms=max_atoms;
success = success && dev_x.alloc(_max_atoms*sizeof(vec4));
success = success && dev_q.alloc(_max_atoms*sizeof(vec4));
success = success && ans.alloc(ans_fields()*_max_atoms);
success = success && host_read.alloc_rw(_max_atoms*ans_fields());
success = success && host_write.alloc_w(_max_atoms*atom_fields());
}
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
if (!allocated)
return;
allocated=false;
dev_x.unbind();
ans.clear();
dev_x.clear();
dev_q.clear();
host_write.clear();
host_read.clear();
}
template <class numtyp, class acctyp>
@ -82,88 +104,88 @@ void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
if (!vflag)
csize-=6;
host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
host_read.copy_from_device(ans.begin(),_inum*csize,s);
}
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
const bool vflag_atom, double *eatom,
double **vatom, double *virial) {
double **vatom, double *virial,
double **f, double **tor, const int n) {
double evdwl=0.0;
int gap=ans.row_size()-_inum;
acctyp *ap=host_read.begin();
if (_eflag) {
if (eflag_atom) {
for (int i=0; i<_inum; i++) {
int ii=ilist[i];
if (_eflag) {
if (eflag_atom) {
evdwl+=*ap;
eatom[ilist[i]]+=*ap*0.5;
eatom[ii]+=*ap*0.5;
ap++;
}
} else
for (int i=0; i<_inum; i++) {
} else {
evdwl+=*ap;
ap++;
}
ap+=gap;
evdwl*=0.5;
}
_read_loc=ap;
gap=ans.row_size();
if (_vflag) {
if (vflag_atom) {
for (int ii=0; ii<_inum; ii++) {
int i=ilist[ii];
ap=_read_loc+ii;
}
if (_vflag) {
if (vflag_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=gap;
ap++;
}
}
} else {
for (int ii=0; ii<_inum; ii++) {
ap=_read_loc+ii;
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=gap;
ap++;
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
_read_loc+=gap*6;
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap++;
if (i<n) {
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap++;
} else {
ap+=3;
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
evdwl*=0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::add_forces(const int *ilist, double **f) {
int gap=ans.row_size();
for (int ii=0; ii<_inum; ii++) {
acctyp *ap=_read_loc+ii;
int i=ilist[ii];
f[i][0]+=*ap;
ap+=gap;
f[i][1]+=*ap;
ap+=gap;
f[i][2]+=*ap;
}
}
template <class numtyp, class acctyp>
void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
int gap=ans.row_size();
_read_loc+=gap*3;
for (int ii=0; ii<n; ii++) {
acctyp *ap=_read_loc+ii;
int i=ilist[ii];
tor[i][0]+=*ap;
ap+=gap;
tor[i][1]+=*ap;
ap+=gap;
tor[i][2]+=*ap;
void PairGPUAtomT::copy_asphere(const int *ilist, double **f, double **tor,
const int n) {
acctyp *ap=host_read.begin();
for (int i=0; i<_inum; i++) {
int ii=ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap++;
if (i<n) {
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap++;
} else {
ap+=3;
}
}
}

View File

@ -26,18 +26,21 @@
#define PRECISION float
#define ACC_PRECISION double
#define MAX_ATOMS 65536
#define vec4 float4
#endif
#ifdef _DOUBLE_DOUBLE
#define PRECISION double
#define ACC_PRECISION double
#define MAX_ATOMS 32768
struct vec4 { double x; double y; double z; double w; };
#endif
#ifndef PRECISION
#define PRECISION float
#define ACC_PRECISION float
#define MAX_ATOMS 65536
#define vec4 float4
#endif
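// Note: in the single-precision builds vec4 aliases CUDA's built-in float4,
// so position+type (or a quaternion) is fetched in one 16-byte load; the
// double build substitutes a plain struct with the same member names.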
#include "nvc_timer.h"
@ -71,7 +74,8 @@ class PairGPUAtom {
/// Must be called once to allocate host and device memory
/** \note atom_fields and ans_fields should be set first if not default **/
void init(const int max_atoms);
bool init(const int max_atoms);
void resize(const int max_atoms, bool &success);
/// Free all memory on host and device
void clear();
@ -99,11 +103,35 @@ class PairGPUAtom {
for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
}
/// Copy num_rows x nall() write buffer to x in GPU
/** num_rows<=atom_fields() **/
inline void copy_atom_data(const int num_rows, cudaStream_t &stream)
{ dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }
/// Add positions to write buffer
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, const int *host_type) {
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
}
/// Add quaternions to write buffer
/** Copies nall() elements **/
template<class cpytyp>
inline void add_q_data(const cpytyp *host_ptr) {
const int end=_nall*4;
for (int i=0; i<end; i++) { *_write_loc=host_ptr[i]; _write_loc++; }
}
/// Copy packed positions+types (and, separately, quaternions) to the GPU
inline void copy_x_data(cudaStream_t &stream)
{ dev_x.copy_from_host(host_write.begin(),_nall*4,stream); }
inline void copy_q_data(cudaStream_t &stream)
{ dev_q.copy_from_host(host_write.begin()+_nall*4,_nall*4,stream); }
// -------------------------COPY FROM GPU -------------------------------
@ -113,22 +141,19 @@ class PairGPUAtom {
/// Copy energy and virial data into LAMMPS memory
double energy_virial(const int *ilist, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial);
/// Add forces from the GPU into a LAMMPS pointer
void add_forces(const int *ilist, double **f);
/// Add torques from the GPU into a LAMMPS pointer
void add_torques(const int *ilist, double **tor, const int n);
double *virial, double **f, double **tor, const int);
/// Add forces and torques from the GPU into a LAMMPS pointer
void copy_asphere(const int *ilist, double **f, double **tor, const int n);
// ------------------------------ DATA ----------------------------------
// atom_fields() x n (example: rows 1-3 position, 4 is type)
NVC_ConstMatT dev_x;
// ans_fields() x n Storage for Forces, etc.
// example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-10 is force
// example: if (!eflag) 1-3 is force
NVC_Mat<acctyp> ans;
// atom coordinates
NVC_Vec<numtyp> dev_x;
// quaternions
NVC_Vec<numtyp> dev_q;
// ans_fields()
// example: if (eflag and vflag) 1 is energy, 2-7 is virial, 8-13 is
// force+torque
NVC_Vec<acctyp> ans;
// Buffer for moving floating point data to GPU
NVC_HostT host_write;

View File

@ -17,6 +17,12 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <assert.h>
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"

View File

@ -26,8 +26,9 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
return (max_nbors+3)*sizeof(int);
}
void PairGPUNbor::init(const int ij_size, const int max_atoms,
bool PairGPUNbor::init(const int ij_size, const int max_atoms,
const int max_nbors) {
bool success=true;
if (allocated)
clear();
@ -35,16 +36,29 @@ void PairGPUNbor::init(const int ij_size, const int max_atoms,
time_nbor.init();
if (_use_packing)
dev_nbor.safe_alloc(max_nbors+4,max_atoms);
success=success && dev_nbor.alloc((max_nbors+4)*max_atoms);
else
dev_nbor.safe_alloc(3,max_atoms);
success=success && dev_nbor.alloc(3*max_atoms);
ij.safe_alloc(max_nbors*max_atoms);
host_ij.safe_alloc_w(ij_size);
success=success && ij.alloc(max_nbors*max_atoms);
success=success && host_ij.alloc_w(ij_size);
allocated=true;
return success;
}
void PairGPUNbor::resize(const int nlocal, const int max_nbor, bool &success) {
dev_nbor.clear();
ij.clear();
if (_use_packing)
success=success && dev_nbor.alloc((max_nbor+4)*nlocal);
else
success=success && dev_nbor.alloc(3*nlocal);
success=success && ij.alloc(max_nbor*nlocal);
allocated=true;
}
void PairGPUNbor::clear() {
if (!allocated)
return;
@ -54,7 +68,7 @@ void PairGPUNbor::clear() {
host_ij.clear();
dev_nbor.clear();
}
double PairGPUNbor::host_memory_usage() const {
return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
}
@ -65,12 +79,35 @@ void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
dev_nbor.copy_from_host(ilist,inum);
int acc=0;
for (int i=0; i<inum; i++) {
host_ij[i]=numj[ilist[i]];
host_ij[i+inum]=acc;
acc+=numj[ilist[i]];
}
host_ij.copy_to_2Ddevice(dev_nbor.begin()+dev_nbor.row_size(),
dev_nbor.row_size(),2,inum, s);
int ij_size=host_ij.numel();
if (inum*2<ij_size) {
for (int i=0; i<inum; i++) {
host_ij[i]=numj[ilist[i]];
host_ij[i+inum]=acc;
acc+=numj[ilist[i]];
}
host_ij.copy_to_device(dev_nbor.begin()+inum,2*inum, s);
} else {
int offset=0;
int half=ij_size/2;
int hi=0;
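// host_ij is a fixed-size pinned staging buffer. When 2*inum entries do not
// fit, it is split in half: neighbor counts go in the lower half and prefix
// offsets in the upper, each full half is flushed with an async copy, and
// the stream is synchronized before the buffer is reused.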
for (int i=0; i<inum; i++) {
host_ij[hi]=numj[ilist[i]];
host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
hi++;
if (hi==half) {
host_ij.copy_to_device(dev_nbor.begin()+inum+offset,half,s);
host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,half,s);
offset+=half;
hi=0;
CUDA_SAFE_CALL(cudaStreamSynchronize(s));
}
}
if (hi>0) {
host_ij.copy_to_device(dev_nbor.begin()+inum+offset,hi,s);
host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,hi,s);
}
}
}

View File

@ -40,8 +40,10 @@ class PairGPUNbor {
void packing(const bool use_packing) { _use_packing=use_packing; }
/// Called once to allocate memory
void init(const int ij_size, const int max_atoms, const int max_nbors);
bool init(const int ij_size, const int max_atoms, const int max_nbors);
void resize(const int nlocal, const int max_nbor, bool &success);
/// Free all memory on host and device
void clear();
@ -73,7 +75,7 @@ class PairGPUNbor {
// - 1st row is i
// - 2nd row is numj (number of neighbors)
// - 3rd row is starting address in host_ij of neighbors
NVC_MatI dev_nbor;
NVC_VecI dev_nbor;
// --------------- Timing Stuff
NVCTimer time_nbor;

View File

@ -31,32 +31,6 @@
#define GB_GPU_DOUBLE
#endif
// ------------------------------- x ------------------------------------
static texture<float, 2, cudaReadModeElementType> x_float_tex;
static texture<int2, 2, cudaReadModeElementType> x_double_tex;
template <class numtyp> inline textureReference * x_get_texture() {
const textureReference *ptr;
cudaGetTextureReference(&ptr,"x_float_tex");
return const_cast<textureReference *>(ptr);
}
template <> inline textureReference * x_get_texture<double>() {
const textureReference *ptr;
cudaGetTextureReference(&ptr,"x_double_tex");
return const_cast<textureReference *>(ptr);
}
template <class numtyp>
static __inline__ __device__ numtyp _x_(const int i, const int j) {
return tex2D(x_float_tex,i,j);
}
#ifdef GB_GPU_DOUBLE
template <>
static __inline__ __device__ double _x_<double>(const int i,const int j) {
int2 t=tex2D(x_double_tex,i,j);
return __hiloint2double(t.y, t.x);
}
#endif
// ------------------------------- form ------------------------------------
static texture<int, 2, cudaReadModeElementType> form_tex;

View File

@ -17,6 +17,12 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include "pair_gpu_atom.cu"
#include "lj_gpu.cu"
#include "gb_gpu.cu"