diff --git a/lib/gpu/README b/lib/gpu/README
index 02859b17ad..d459a59c96 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -74,13 +74,3 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
       package has been installed before installing the GPU package in LAMMPS.
 
-                              GPU MEMORY
-
-Upon initialization of the gayberne/gpu pair style, the library will reserve
-memory for 64K atoms per GPU or 70% of each cards GPU memory, whichever value
-is limiting. The value of 70% can be changed by editing the
-PERCENT_GPU_MEMORY definition in the source file. For gayberne/gpu, the value
-of 64K cannot be increased and is the maximum number of atoms allowed per
-GPU. Using the 'neigh_modify one' modifier in your LAMMPS input script
-can help to increase maximum number of atoms per GPU for cards with
-limited memory.
diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu
index 10c6557b31..f95e1cbd77 100644
--- a/lib/gpu/gb_gpu.cu
+++ b/lib/gpu/gb_gpu.cu
@@ -36,7 +36,7 @@ static GB_GPU_Memory GBMF[MAX_GPU_THREADS];
 // -- Only pack neighbors within cutoff
 // ---------------------------------------------------------------------------
 template<class numtyp>
-__global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
                                  const int start, const int inum,
                                  const int *dev_ij, const int form_low,
                                  const int form_high, const int nall) {
@@ -56,25 +56,24 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
     int *nbor_newj=nbor;
     nbor+=nbor_pitch;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,7);
+    vec4 ix=x_[i];
+    int itype=ix.w;
 
     int newj=0;
     for ( ; list<list_end; list++) {
       int j=*list;
       if (j>=nall)
         j%=nall;
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
         // Compute r12;
-        numtyp rsq=_x_<numtyp>(j,0)-ix;
+        numtyp rsq=jx.x-ix.x;
         rsq*=rsq;
-        numtyp t=_x_<numtyp>(j,1)-iy;
+        numtyp t=jx.y-ix.y;
         rsq+=t*t;
-        t=_x_<numtyp>(j,2)-iz;
+        t=jx.z-ix.z;
         rsq+=t*t;
 
         if (rsq< _cutsq_(itype,jtype)) {
@@ -95,7 +94,7 @@ __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch,
 // -- Fast version of routine that uses shared memory for LJ constants
 // ---------------------------------------------------------------------------
 template<class numtyp>
-__global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
                                       const int start, const int inum,
                                       const int *dev_ij, const int form_low,
                                       const int form_high, const int nall) {
@@ -124,25 +123,25 @@ __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch,
     int *nbor_newj=nbor;
     nbor+=nbor_pitch;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
+    vec4 ix=x_[i];
+    int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
 
     int newj=0;
     for ( ; list<list_end; list++) {
       int j=*list;
       if (j>=nall)
         j%=nall;
-      int mtype=itype+_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
+      int mtype=itype+jtype;
 
       if (form[mtype]>=form_low && form[mtype]<=form_high) {
         // Compute r12;
-        numtyp rsq=_x_<numtyp>(j,0)-ix;
+        numtyp rsq=jx.x-ix.x;
         rsq*=rsq;
-        numtyp t=_x_<numtyp>(j,1)-iy;
+        numtyp t=jx.y-ix.y;
         rsq+=t*t;
-        t=_x_<numtyp>(j,2)-iz;
+        t=jx.z-ix.z;
         rsq+=t*t;
 
         if (rsq<cutsq[mtype]) {
[...]
 template<class numtyp, class acctyp>
 void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
                 const int inum, const int form_low, const int form_high) {
-  if (gbm.shared_types)
+  if (gbm.shared_types) {
     kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
-      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
+      ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
+       gbm.atom.inum(), start, inum,
        gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
-  else
+  } else
     kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
-      (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
+      ((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
+       gbm.atom.inum(), start, inum,
        gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
 }
 
@@ -188,9 +189,7 @@ EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
   string sname=GBMF[0].gpu.name(id)+", "+
                gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
                gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
-               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ, "+
-               gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
-                                                max_nbors))+" Atoms";
+               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
   strcpy(name,sname.c_str());
 }
 
@@ -203,6 +202,7 @@ EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                         double **epsilon, double *host_lshape, int **form,
                         double **host_lj1, double **host_lj2, double **host_lj3,
                         double **host_lj4, double **offset, double *special_lj,
+                        const int nlocal, const int nall,
                         const int max_nbors, const int thread,
                         const int gpu_id) {
   assert(thread<MAX_GPU_THREADS);
[...]
 template<class numtyp, class acctyp>
 inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
-                        double **host_quat, const int *host_type,
-                        const bool rebuild, cudaStream_t &stream) {
+                         double **host_quat, const int *host_type,
+                         const bool rebuild, cudaStream_t &stream) {
   atom.time_atom.start();
   atom.reset_write_buffer();
 
   // Rows 1-3 of dev_x are position; rows 4-7 are quaternion
-  atom.add_atom_data(host_x[0],3);
-  atom.add_atom_data(host_x[0]+1,3);
-  atom.add_atom_data(host_x[0]+2,3);
-  atom.add_atom_data(host_quat[0],4);
-  atom.add_atom_data(host_quat[0]+1,4);
-  atom.add_atom_data(host_quat[0]+2,4);
-  atom.add_atom_data(host_quat[0]+3,4);
-
-  int csize=7;
-
-  // If a rebuild occured, copy type data
-  if (rebuild) {
-    atom.add_atom_data(host_type);
-    csize++;
-  }
-
-  atom.copy_atom_data(csize,stream);
+  atom.add_x_data(host_x,host_type);
+  atom.add_q_data(host_quat[0]);
+
+  atom.copy_x_data(stream);
+  atom.copy_q_data(stream);
   atom.time_atom.stop();
 }
 
 EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
-                        const int *host_type, const bool rebuild, const int thread) {
+                        const int *host_type, const bool rebuild,
+                        const int thread) {
   _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
                GBMF[thread].pair_stream);
 }
 
@@ -269,46 +259,111 @@
 template<class gbmtyp>
 int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
                           const int inum, int *ilist, const int *numj,
                           const int *type, bool &success) {
-  if (nall>gbm.max_atoms) {
-    success=false;
-    return 0;
-  }
   success=true;
 
   gbm.nbor.time_nbor.start();
 
+  int mn=0;
+  for (int i=0; i<inum; i++)
+    mn=std::max(mn,numj[i]);
+
+  if (nall>gbm.max_atoms)
+    gbm.resize_atom(nall,success);
+  if (nlocal>gbm.max_local || mn>gbm._max_nbors)
+    gbm.resize_local(nlocal,mn,success);
+  if (!success)
+    return false;
+
   gbm.atom.nall(nall);
   gbm.atom.inum(inum);
 
   if (gbm.multiple_forms) {
-    int p=0, acc=0;
-    for (int i=0; i
[...]
+      if (hi>0) {
+        gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
+                                        hi,gbm.pair_stream);
+        gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
+                                        inum*2+offset,
+                                        hi,gbm.pair_stream);
+      }
+      gbm.nbor.ij_total=0;
     }
-    gbm.nbor.ij_total=0;
-    gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
-    gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
-                                      gbm.nbor.dev_nbor.row_size(),
-                                      gbm.nbor.dev_nbor.row_size(),2,inum,
-                                      gbm.pair_stream);
   } else {
     gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
     gbm.last_ellipse=inum;
@@ -349,6 +404,14 @@ EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
   _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }
 
+template<class numtyp, class acctyp>
+void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
+  gbm.atom.time_answer.start();
+  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
+  gbm.atom.time_answer.stop();
+}
+
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques for all ij interactions
 // ---------------------------------------------------------------------------
@@ -357,7 +420,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
                       const bool rebuild) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=BLOCK_1D;
-
+  int ans_pitch=6;
+  if (eflag)
+    ans_pitch++;
+  if (vflag)
+    ans_pitch+=6;
+
   int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
 
   if (gbm.multiple_forms) {
@@ -371,9 +439,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
       gbm.time_gayberne.start();
       kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
-         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
-         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
+        ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
+         gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
+         gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
+         gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
          eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
       gbm.time_gayberne.stop();
 
@@ -393,14 +462,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
       GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                                gbm.last_ellipse)/BX));
       pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
-                 ELLIPSE_SPHERE);
+                 ELLIPSE_SPHERE);
       gbm.time_kernel2.stop();
 
       gbm.time_gayberne2.start();
       kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-        (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
-         gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
-         gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
+        ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
+         gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
+         gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
+         gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
          eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
       gbm.time_gayberne2.stop();
     } else {
@@ -419,16 +489,15 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
       if (gbm.last_ellipse<gbm.atom.inum())
         kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
-           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
-           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),
-           gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse,
-           gbm.atom.inum(), gbm.atom.nall());
+          ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
+           gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
+           gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
+           vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
       else
         kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-          (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(),
-           gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(),
-           gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
+          ((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
+           gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
+           gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
           eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
     }
     gbm.time_pair.stop();
@@ -437,19 +506,21 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
     pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
     gbm.time_kernel.stop();
 
-    gbm.time_gayberne.start();
+    gbm.time_gayberne.start();
     kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
-      (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
-       gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
-       gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(),
+      ((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
+       gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
+       gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
+       gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
       eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
     gbm.time_gayberne.stop();
   }
 }
 
 EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
-                            const int thread) {
+                            const int thread) {
   _gb_gpu_gayberne(GBMF[thread],eflag,vflag,rebuild);
+  _gb_gpu_enqueue(GBMF[thread],eflag,vflag);
 }
 
 // ---------------------------------------------------------------------------
@@ -462,9 +533,6 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                       double *virial) {
   double evdw;
 
-  gbm.atom.time_answer.start();
-  gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
-
   gbm.atom.time_atom.add_to_total();
   gbm.nbor.time_nbor.add_to_total();
   gbm.time_kernel.add_to_total();
@@ -475,11 +543,19 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
     gbm.time_pair.add_to_total();
   }
   CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
-
-  evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
-  gbm.atom.add_forces(ilist,f);
-  gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
-  gbm.atom.time_answer.stop();
+  if (gbm.last_ellipse>gbm.atom.inum()) {
+    if (eflag || vflag)
+      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
+                                  f,tor,gbm.atom.inum());
+    else
+      gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
+  } else {
+    if (eflag || vflag)
+      evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
+                                  f,tor,gbm.last_ellipse);
+    else
+      gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
+  }
   gbm.atom.time_answer.add_to_total();
   return evdw;
 }
@@ -516,3 +592,4 @@ EXTERN int gb_gpu_num_devices() {
 EXTERN double gb_gpu_bytes() {
   return GBMF[0].host_memory_usage();
 }
+
diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h
index 0e060a80a8..1e51389197 100644
--- a/lib/gpu/gb_gpu_extra.h
+++ b/lib/gpu/gb_gpu_extra.h
@@ -298,24 +298,22 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
    ------------------------------------------------------------------------- */
 
 template<class numtyp>
-static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi,
+static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
+                                                        const int qi,
                                                         numtyp mat[9]) {
-  numtyp qi3=_x_<numtyp>(qi,3);
-  numtyp qi4=_x_<numtyp>(qi,4);
-  numtyp qi5=_x_<numtyp>(qi,5);
-  numtyp qi6=_x_<numtyp>(qi,6);
-
-  numtyp w2 = qi3*qi3;
-  numtyp i2 = qi4*qi4;
-  numtyp j2 = qi5*qi5;
-  numtyp k2 = qi6*qi6;
-  numtyp twoij = (numtyp)2.0*qi4*qi5;
-  numtyp twoik = (numtyp)2.0*qi4*qi6;
-  numtyp twojk = (numtyp)2.0*qi5*qi6;
-  numtyp twoiw = (numtyp)2.0*qi4*qi3;
-  numtyp twojw = (numtyp)2.0*qi5*qi3;
-  numtyp twokw = (numtyp)2.0*qi6*qi3;
+  vec4 q=qif[qi];
+
+  numtyp w2 = q.x*q.x;
+  numtyp i2 = q.y*q.y;
+  numtyp j2 = q.z*q.z;
+  numtyp k2 = q.w*q.w;
+  numtyp twoij = (numtyp)2.0*q.y*q.z;
+  numtyp twoik = (numtyp)2.0*q.y*q.w;
+  numtyp twojk = (numtyp)2.0*q.z*q.w;
+  numtyp twoiw = (numtyp)2.0*q.y*q.x;
+  numtyp twojw = (numtyp)2.0*q.z*q.x;
+  numtyp twokw = (numtyp)2.0*q.w*q.x;
 
   mat[0] = w2+i2-j2-k2;
   mat[3] = twoij-twokw;
diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h
index 110a41fecf..b3f8b3ccc0 100644
--- a/lib/gpu/gb_gpu_kernel.h
+++ b/lib/gpu/gb_gpu_kernel.h
@@ -92,16 +92,19 @@ static __inline__ __device__ void compute_eta_torque(numtyp m[9],
                       m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }
 
+#include "gb_gpu_kernel.h"
+
 template<class numtyp, class acctyp>
-__global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
-                                const int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
+                                const numtyp *gum, const numtyp *special_lj,
+                                const int *dev_nbor, const size_t nbor_pitch,
                                 acctyp *ans, size_t ans_pitch, int *err_flag,
                                 const bool eflag, const bool vflag,
                                 const int inum, const int nall) {
   __shared__ numtyp sp_lj[4];
 
-  // ii indexes the two interacting particles in gi
+  // ii indexes the two interacting particles in gi
   int ii=threadIdx.x;
   if (ii<4)
     sp_lj[ii]=special_lj[ii];
@@ -109,7 +112,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
   __syncthreads();
 
   if (ii<inum) {
[...]
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,7);
+    vec4 ix=x_[i];
+    int itype=ix.w;
 
     numtyp a1[9], b1[9], g1[9];
     {
       numtyp t[9];
-      gpu_quat_to_mat_trans(i,a1);
+      gpu_quat_to_mat_trans(q,i,a1);
       gpu_shape_times3(itype,a1,t);
       gpu_transpose_times3(a1,t,g1);
       gpu_well_times3(itype,a1,t);
@@ -146,7 +147,7 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
       int j=*nbor;
       if (j < nall)
         factor_lj = (numtyp)1.0;
       else {
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       // Compute r12
       numtyp r12[3];
-      r12[0] = _x_<numtyp>(j,0)-ix;
-      r12[1] = _x_<numtyp>(j,1)-iy;
-      r12[2] = _x_<numtyp>(j,2)-iz;
+      r12[0] = jx.x-ix.x;
+      r12[1] = jx.y-ix.y;
+      r12[2] = jx.z-ix.z;
 
       numtyp ir = gpu_dot3(r12,r12);
       ir = rsqrt(ir);
       numtyp r = (numtyp)1.0/ir;
 
       numtyp a2[9];
-      gpu_quat_to_mat_trans(j,a2);
+      gpu_quat_to_mat_trans(q,j,a2);
 
       numtyp u_r, dUr[3], tUr[3], eta, teta[3];
       {                           // Compute U_r, dUr, eta, and teta
@@ -366,35 +368,37 @@ __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
     } // for nbor
 
     // Store answers
-    acctyp *ap1=ans+ii;
+    acctyp *ap1=ans+ii*ans_pitch;
     if (eflag) {
       *ap1=energy;
-      ap1+=ans_pitch;
+      ap1++;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1=virial[i];
-        ap1+=ans_pitch;
+        ap1++;
       }
     }
     *ap1=fx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=fy;
-    ap1+=ans_pitch;
+    ap1++;
    *ap1=fz;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=torx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=tory;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=torz;
 
   } // if ii
+
 }
 
 template<class numtyp, class acctyp>
-__global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
-                                 const int *dev_nbor, const int nbor_pitch,
+__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
+                                 const numtyp *gum, const numtyp *special_lj,
+                                 const int *dev_nbor, const size_t nbor_pitch,
                                  acctyp *ans, size_t ans_pitch, int *err_flag,
                                  const bool eflag, const bool vflag,
                                  const int start, const int inum,
@@ -425,12 +429,10 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
-  const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
+  const int *nbor_end=nbor+nbor_pitch*numj;
 
-  numtyp ix=_x_<numtyp>(i,0);
-  numtyp iy=_x_<numtyp>(i,1);
-  numtyp iz=_x_<numtyp>(i,2);
-  int itype=_x_<numtyp>(i,7);
+  vec4 ix=x_[i];
+  int itype=ix.w;
 
   numtyp oner=_shape_(itype,0);
   numtyp one_well=_well_(itype,0);
@@ -445,13 +447,14 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       // Compute r12
       numtyp r12[3];
-      r12[0] = _x_<numtyp>(j,0)-ix;
-      r12[1] = _x_<numtyp>(j,1)-iy;
-      r12[2] = _x_<numtyp>(j,2)-iz;
+      r12[0] = jx.x-ix.x;
+      r12[1] = jx.y-ix.y;
+      r12[2] = jx.z-ix.z;
 
       numtyp ir = gpu_dot3(r12,r12);
       ir = rsqrt(ir);
@@ -463,7 +466,7 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
       r12hat[2]=r12[2]*ir;
 
       numtyp a2[9];
-      gpu_quat_to_mat_trans(j,a2);
+      gpu_quat_to_mat_trans(q,j,a2);
 
       numtyp u_r, dUr[3], eta;
       {                           // Compute U_r, dUr, eta, and teta
@@ -611,28 +614,29 @@ __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
     } // for nbor
 
     // Store answers
-    acctyp *ap1=ans+ii;
+    acctyp *ap1=ans+ii*ans_pitch;
     if (eflag) {
       *ap1=energy;
-      ap1+=ans_pitch;
+      ap1++;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1=virial[i];
-        ap1+=ans_pitch;
+        ap1++;
       }
     }
     *ap1=fx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=fy;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1=fz;
 
   } // if ii
 }
 
 template<class numtyp, class acctyp>
-__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
-                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
+__global__ void kernel_lj(const vec4 *x_,
+                          const numtyp *special_lj, const int *dev_nbor,
+                          const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
                           size_t ans_pitch, int *err_flag,
                           const bool eflag, const bool vflag,
                           const int start, const int inum, const int nall) {
@@ -663,10 +667,8 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,7);
+    vec4 ix=x_[i];
+    int itype=ix.w;
 
     numtyp factor_lj;
     for ( ; list<list_end; list++) {
[...]
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int jtype=_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int jtype=jx.w;
 
       // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
 
       if (r2inv<_cutsq_(itype,jtype) &&
@@ -716,29 +719,30 @@ __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
     } // for nbor
 
     // Store answers
-    acctyp *ap1=ans+ii;
+    acctyp *ap1=ans+ii*ans_pitch;
     if (eflag) {
       *ap1+=energy;
-      ap1+=ans_pitch;
+      ap1++;
     }
     if (vflag) {
      for (int i=0; i<6; i++) {
         *ap1+=virial[i];
-        ap1+=ans_pitch;
+        ap1++;
       }
     }
     *ap1+=fx;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1+=fy;
-    ap1+=ans_pitch;
+    ap1++;
     *ap1+=fz;
 
   } // if ii
 }
 
 template<class numtyp, class acctyp>
-__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
-                               const int *dev_ij, const int nbor_pitch,
+__global__ void kernel_lj_fast(const vec4 *x_,
+                               const numtyp *special_lj, const int *dev_nbor,
+                               const size_t nbor_pitch, const int *dev_ij,
                                acctyp *ans, size_t ans_pitch,int *err_flag,
                                const bool eflag, const bool vflag,
                                const int start, const int inum,
                                const int nall){
@@ -788,10 +792,8 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
 
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
+    vec4 ix=x_[i];
+    int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
 
     numtyp factor_lj;
     for ( ; list<list_end; list++) {
[...]
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
-      int mtype=itype+_x_<numtyp>(j,7);
+      vec4 jx=x_[j];
+      int mtype=itype+jx.w;
 
       // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
 
       if (r2inv
[...]
diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu
[...]
   if (this->allocated)
     clear();
 
   bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
                               host_lj1, host_lj2, host_lj3, host_lj4,
-                              host_offset, host_special_lj, max_nbors, me);
+                              host_offset, host_special_lj, max_nbors, me,
+                              nlocal, nall);
 
   if (!p)
     return false;
 
@@ -96,11 +99,28 @@ bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
     multiple_forms=true;
 
   // Memory for ilist ordered by particle type
-  host_olist.safe_alloc_rw(this->max_atoms);
-
-  return true;
+  return host_olist.alloc_rw(this->max_local);
 }
-
+
+template <class numtyp, class acctyp>
+void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
+  this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
+  this->atom.resize(this->max_atoms, success);
+}
+
+template <class numtyp, class acctyp>
+void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
+                                  bool &success) {
+  if (nlocal>this->max_local) {
+    this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
+    host_olist.clear();
+    success=success && host_olist.alloc_rw(this->max_local);
+  }
+  if (max_nbors>_max_nbors)
+    _max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
+  this->nbor.resize(this->max_local,_max_nbors,success);
+}
+
 template <class numtyp, class acctyp>
 void GB_GPU_MemoryT::clear() {
   if (!this->allocated)
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index 496ce2033d..63627065bf 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -37,8 +37,11 @@ class GB_GPU_Memory : public LJ_GPU_Memory {
             double **host_epsilon, double *host_lshape, int **h_form,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
-            const int max_nbors, const bool force_d, const int me);
+            const int max_nbors, const int nlocal, const int nall,
+            const bool force_d, const int me);
 
+  void resize_atom(const int nall, bool &success);
+  void resize_local(const int nlocal, const int max_nbors, bool &success);
   void clear();
 
   double host_memory_usage();
@@ -61,6 +64,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory {
   bool multiple_forms;
   int **host_form;
   int last_ellipse;
+  int _max_nbors;
 
  private:
 };
diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index e31321ad2e..e96d3efb4f 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -74,7 +74,8 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
   ij_size=IJ_SIZE;
 
   bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
-                       host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
+                       host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
+                       0,0);
 
   ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
   ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
@@ -101,37 +102,6 @@ EXTERN void lj_gpu_clear() {
 }
 
-// ---------------------------------------------------------------------------
-// Calculate energies and forces for all ij interactions
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=BLOCK_1D;
-
-  int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
-
-  ljm.time_pair.start();
-
-  if (ljm.shared_types)
-    kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
-      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
-       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
-       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
-       vflag, ljm.atom.inum(), ljm.atom.nall());
-  else
-    kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
-      (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(),
-       ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(),
-       ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
-       vflag, ljm.atom.inum(), ljm.atom.nall());
-  ljm.time_pair.stop();
-}
-
-EXTERN void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
-  _lj_gpu(LJMF, eflag,vflag,rebuild);
-}
-
 template <class numtyp, class acctyp>
 double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
                     double **host_x, int *host_type, const int inum,
diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h
index f783616a83..b2f03cde2b 100644
--- a/lib/gpu/lj_gpu_kernel.h
+++ b/lib/gpu/lj_gpu_kernel.h
@@ -217,229 +217,4 @@ __global__ void kernel_lj_cell(float3 *force3,
 }
 
-
-/* Neigbhor list version of LJ kernel */
-template <class numtyp, class acctyp>
-__global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor,
-                          const int *dev_ij, const int nbor_pitch, acctyp *ans,
-                          size_t ans_pitch, const bool eflag,
-                          const bool vflag, const int inum, const int nall) {
-  __shared__ numtyp sp_lj[4];
-
-  // ii indexes the two interacting particles in gi
-  int ii=threadIdx.x;
-  if (ii<4)
-    sp_lj[ii]=special_lj[ii];
-  ii+=INT_MUL(blockIdx.x,blockDim.x);
-
-  if (ii
[...]
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=_x_<numtyp>(i,3);
-
-    numtyp factor_lj;
-    for ( ; list
[...]
-      int jtype=_x_<numtyp>(j,3);
-
-      // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
-      numtyp r2inv = delx*delx+dely*dely+delz*delz;
-
-      if (r2inv<_cutsq_(itype,jtype)) {
-        r2inv=(numtyp)1.0/r2inv;
-        numtyp r6inv =r2inv*r2inv*r2inv;
-        numtyp force =factor_lj*r2inv*r6inv*(_lj1_(itype,jtype).x*r6inv-
-                                             _lj1_(itype,jtype).y);
-
-        fx+=delx*force;
-        fy+=dely*force;
-        fz+=delz*force;
-
-        if (eflag) {
-          numtyp e=r6inv*(_lj3_(itype,jtype).x*r6inv-
-                          _lj3_(itype,jtype).y);
-          energy+=factor_lj*(e-_offset_(1,1));
-        }
-        if (vflag) {
-          virial[0] += delx*delx*force;
-          virial[1] += dely*dely*force;
-          virial[2] += delz*delz*force;
-          virial[3] += delx*dely*force;
-          virial[4] += delx*delz*force;
-          virial[5] += dely*delz*force;
-        }
-      }
-
-    } // for nbor
-
-    // Store answers
-    acctyp *ap1=ans+ii;
-    if (eflag) {
-      *ap1=energy;
-      ap1+=ans_pitch;
-    }
-    if (vflag) {
-      for (int i=0; i<6; i++) {
-        *ap1=virial[i];
-        ap1+=ans_pitch;
-      }
-    }
-    *ap1=fx;
-    ap1+=ans_pitch;
-    *ap1=fy;
-    ap1+=ans_pitch;
-    *ap1=fz;
-
-  } // if ii
-}
-
-template <class numtyp, class acctyp>
-__global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
-                               const int *dev_ij, const int nbor_pitch,
-                               acctyp *ans, size_t ans_pitch,const bool eflag,
-                               const bool vflag, const int inum,
-                               const int nall) {
-
-  // ii indexes the two interacting particles in gi
-  int ii=threadIdx.x;
-  __shared__ numtyp sp_lj[4];
-  __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  if (ii<4)
-    sp_lj[ii]=special_lj[ii];
-  if (ii
[...]
-    cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
-    lj1[ii]=_lj1_(itype,jtype).x;
-    lj2[ii]=_lj1_(itype,jtype).y;
-    if (eflag) {
-      lj3[ii]=_lj3_(itype,jtype).x;
-      lj4[ii]=_lj3_(itype,jtype).y;
-      offset[ii]=_offset_(itype,jtype);
-    }
-  }
-  ii+=INT_MUL(blockIdx.x,blockDim.x);
-
-  if (ii
[...]
-    numtyp ix=_x_<numtyp>(i,0);
-    numtyp iy=_x_<numtyp>(i,1);
-    numtyp iz=_x_<numtyp>(i,2);
-    int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,3));
-
-    numtyp factor_lj;
-
-    for ( ; list
[...]
-      int mtype=itype+_x_<numtyp>(j,3);
-
-      // Compute r12
-      numtyp delx = ix-_x_<numtyp>(j,0);
-      numtyp dely = iy-_x_<numtyp>(j,1);
-      numtyp delz = iz-_x_<numtyp>(j,2);
-      numtyp r2inv = delx*delx+dely*dely+delz*delz;
-
-      if (r2inv
[...]
diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu
[...]
-template <class numtyp, class acctyp>
-int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
-  int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
-                              bytes_per_atom(max_nbors));
-  if (matoms>MAX_ATOMS)
-    matoms=MAX_ATOMS;
-  return matoms;
-}
-
 template <class numtyp, class acctyp>
 bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_cutsq, double **host_sigma,
@@ -41,7 +32,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_lj2, double **host_lj3,
                           double **host_lj4, double **host_offset,
                           double *host_special_lj, const int max_nbors,
-                          const int me) {
+                          const int me, const int nlocal, const int nall) {
   if (allocated)
     clear();
 
@@ -55,9 +46,18 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   time_pair.init();
 
   // Initialize atom and nbor data
-  max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
-  atom.init(max_atoms);
-  nbor.init(ij_size,max_atoms,max_nbors);
+  max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
+  if (max_local==0)
+    max_local=1000;
+  if (nall<=nlocal)
+    max_atoms=max_local*2;
+  else
+    max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
+
+  if (!atom.init(max_atoms))
+    return false;
+  if (!nbor.init(ij_size,max_local,max_nbors))
+    return false;
 
   // Get a stream for computing pair potentials
   CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
@@ -112,7 +112,7 @@ bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   allocated=true;
   return true;
 }
-
+
 template <class numtyp, class acctyp>
 void LJ_GPU_MemoryT::clear() {
   if (!allocated)
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index f5a407ba0b..3a292cb74c 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -28,7 +28,6 @@
 #define BLOCK_1D 64 // max value = 256
 #define CELL_SIZE BLOCK_1D
 #define MAX_SHARED_TYPES 8
-#define PERCENT_GPU_MEMORY 0.7
 #define BIG_NUMBER 100000000
 
 template <class numtyp, class acctyp>
@@ -42,14 +41,13 @@ class LJ_GPU_Memory {
             double **host_sigma, double **host_epsilon,
             double **host_lj1, double **host_lj2, double **host_lj3,
             double **host_lj4, double **host_offset, double *host_special_lj,
-            const int max_nbors, const int me);
+            const int max_nbors, const int me, const int nlocal,
+            const int nall);
   /// Free any memory on host and device
   void clear();
 
   /// Returns memory usage on GPU per atom
   int bytes_per_atom(const int max_nbors) const;
-  /// Maximum number of atoms that can be stored on GPU
-  int get_max_atoms(const size_t gpu_bytes, const int max_nbors);
   /// Total host memory used by library
   double host_memory_usage() const;
@@ -72,7 +70,7 @@ class LJ_GPU_Memory {
   NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
   NVC_VecT special_lj;
 
-  size_t max_atoms;
+  size_t max_atoms, max_local;
 
   // Timing for pair calculation
   NVCTimer time_pair;
diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu
index 652c4fc22a..6b54f10f41 100644
--- a/lib/gpu/nvc_get_devices.cu
+++ b/lib/gpu/nvc_get_devices.cu
@@ -8,7 +8,6 @@
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
-
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
/* ----------------------------------------------------------------------
@@ -17,6 +16,12 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include "nvc_device.h"
 
 int main(int argc, char** argv) {
diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h
index 2273b96895..ef61684c46 100644
--- a/lib/gpu/nvc_macros.h
+++ b/lib/gpu/nvc_macros.h
@@ -20,6 +20,12 @@
 #ifndef NVC_MACROS_H
 #define NVC_MACROS_H
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include
 #include "math_constants.h"
 #define INT_MUL(x,y) (__mul24(x,y))
diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h
index 5b2c164a7f..821ff7b568 100644
--- a/lib/gpu/nvc_memory.h
+++ b/lib/gpu/nvc_memory.h
@@ -96,6 +96,17 @@ class NVC_Host {
     _end=_array+cols;
   }
 
+  // Allocate page-locked memory with fast write/slow read on host
+  inline bool alloc_w(const size_t cols) {
+    _cols=cols;
+    _row_bytes=cols*sizeof(numtyp);
+    if (cudaHostAlloc((void **)&_array,_row_bytes,cudaHostAllocWriteCombined)!=
+        cudaSuccess)
+      return false;
+    _end=_array+cols;
+    return true;
+  }
+
   // Allocate page-locked memory with fast read/write on host
   inline void safe_alloc_rw(const size_t cols) {
     _cols=cols;
@@ -104,6 +115,16 @@ class NVC_Host {
     _end=_array+cols;
   }
 
+  // Allocate page-locked memory with fast read/write on host
+  inline bool alloc_rw(const size_t cols) {
+    _cols=cols;
+    _row_bytes=cols*sizeof(numtyp);
+    if (cudaMallocHost((void **)&_array,_row_bytes)!=cudaSuccess)
+      return false;
+    _end=_array+cols;
+    return true;
+  }
+
   /// Free any memory associated with device
   inline void clear()
     { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
@@ -163,6 +184,13 @@ class NVC_Host {
                                            cudaMemcpyHostToDevice,stream));
   }
 
+  /// Asynchronous copy to device (numel is not bytes)
+  inline void copy_to_device(size_t offset, numtyp *device_p, size_t numel,
+                             cudaStream_t &stream) {
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array+offset,numel*sizeof(numtyp),
+                                           cudaMemcpyHostToDevice,stream));
+  }
+
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
@@ -194,6 +222,16 @@ class NVC_Vec {
     _end=_array+cols;
   }
 
+  // Row vector on device
+  inline bool alloc(const size_t cols) {
+    _cols=cols;
+    _row_bytes=cols*sizeof(numtyp);
+    if (cudaMalloc((void **)&_array,_row_bytes)!=cudaSuccess)
+      return false;
+    _end=_array+cols;
+    return true;
+  }
+
   // Row vector on device (allocate and assign texture and bind)
   inline void safe_alloc(const size_t cols, textureReference *t)
     { safe_alloc(cols); assign_texture(t); bind(); }
@@ -221,11 +259,22 @@ class NVC_Vec {
     { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(),
                                 cudaMemcpyHostToDevice)); }
 
+  /// Copy from host (n elements)
+  inline void copy_from_host(const numtyp *host_p, const size_t n)
+    { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,n*sizeof(numtyp),
+                                cudaMemcpyHostToDevice)); }
+
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
     { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
                                              cudaMemcpyHostToDevice, stream)); }
 
+  /// Asynchronous copy from host (n elements)
+  inline void copy_from_host(const numtyp *host_p, const size_t n,
+                             cudaStream_t &stream)
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,n*sizeof(numtyp),
+                                             cudaMemcpyHostToDevice, stream)); }
+
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
     { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu
index a5573101d0..7f2f13f3b4 100644
--- a/lib/gpu/pair_gpu_atom.cu
+++ b/lib/gpu/pair_gpu_atom.cu
@@ -28,7 +28,9 @@ int PairGPUAtomT::bytes_per_atom() const {
 }
 
 template <class numtyp, class acctyp>
-void PairGPUAtomT::init(const int max_atoms) {
+bool PairGPUAtomT::init(const int max_atoms) {
+  bool success=true;
+
   if (allocated)
     clear();
 
@@ -39,28 +41,48 @@ void PairGPUAtomT::init(const int max_atoms) {
   time_answer.init();
 
   // Device matrices for atom and force data
-  dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>());
-  ans.safe_alloc(ans_fields(),max_atoms);
+  success=success && dev_x.alloc(max_atoms*sizeof(vec4));
+  success=success && dev_q.alloc(max_atoms*sizeof(vec4));
+  success=success && ans.alloc(ans_fields()*max_atoms);
 
+  // Get a host read/write buffer
+  success=success && host_read.alloc_rw(max_atoms*ans_fields());
   // Get a host write only buffer
-  host_write.safe_alloc_w(max_atoms*atom_fields());
-  // Get a host read/write buffer
-  host_read.safe_alloc_rw(ans.row_size()*ans_fields());
+  success=success && host_write.alloc_w(max_atoms*atom_fields());
 
   allocated=true;
+
+  return success;
 }
 
+template <class numtyp, class acctyp>
+void PairGPUAtomT::resize(const int max_atoms, bool &success) {
+  ans.clear();
+  dev_x.clear();
+  dev_q.clear();
+  host_write.clear();
+  host_read.clear();
+
+  _max_atoms=max_atoms;
+
+  success = success && dev_x.alloc(_max_atoms*sizeof(vec4));
+  success = success && dev_q.alloc(_max_atoms*sizeof(vec4));
+  success = success && ans.alloc(ans_fields()*_max_atoms);
+  success = success && host_read.alloc_rw(_max_atoms*ans_fields());
+  success = success && host_write.alloc_w(_max_atoms*atom_fields());
+}
+
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
   if (!allocated)
     return;
   allocated=false;
 
-  dev_x.unbind();
-  ans.clear();
+  ans.clear();
+  dev_x.clear();
+  dev_q.clear();
   host_write.clear();
   host_read.clear();
-  dev_x.clear();
 }
 
@@ -82,88 +104,88 @@ void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
   if (!vflag)
     csize-=6;
 
-  host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
+  host_read.copy_from_device(ans.begin(),_inum*csize,s);
 }
 
 template <class numtyp, class acctyp>
 double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
                                    const bool vflag_atom, double *eatom,
-                                   double **vatom, double *virial) {
+                                   double **vatom, double *virial,
+                                   double **f, double **tor, const int n) {
   double evdwl=0.0;
-  int gap=ans.row_size()-_inum;
   acctyp *ap=host_read.begin();
 
-  if (_eflag) {
-    if (eflag_atom) {
-      for (int i=0; i<_inum; i++) {
+  for (int i=0; i<_inum; i++) {
+    int ii=ilist[i];
+    if (_eflag) {
+      if (eflag_atom) {
         evdwl+=*ap;
-        eatom[ilist[i]]+=*ap*0.5;
+        eatom[ii]+=*ap*0.5;
         ap++;
-      }
-    } else
-      for (int i=0; i<_inum; i++) {
+      } else {
         evdwl+=*ap;
         ap++;
       }
-    ap+=gap;
-    evdwl*=0.5;
-  }
-  _read_loc=ap;
-  gap=ans.row_size();
-  if (_vflag) {
-    if (vflag_atom) {
-      for (int ii=0; ii<_inum; ii++) {
-        int i=ilist[ii];
-        ap=_read_loc+ii;
+    }
+    if (_vflag) {
+      if (vflag_atom) {
         for (int j=0; j<6; j++) {
-          vatom[i][j]+=*ap*0.5;
+          vatom[ii][j]+=*ap*0.5;
           virial[j]+=*ap;
-          ap+=gap;
+          ap++;
         }
-      }
-    } else {
-      for (int ii=0; ii<_inum; ii++) {
-        ap=_read_loc+ii;
+      } else {
         for (int j=0; j<6; j++) {
           virial[j]+=*ap;
-          ap+=gap;
+          ap++;
         }
       }
     }
-    for (int j=0; j<6; j++)
-      virial[j]*=0.5;
-    _read_loc+=gap*6;
+    f[ii][0]+=*ap;
+    ap++;
+    f[ii][1]+=*ap;
+    ap++;
+    f[ii][2]+=*ap;
+    ap++;
+    if (i
[...]
-template <class numtyp, class acctyp>
-void PairGPUAtomT::add_forces(const int *ilist, double **f) {
-  int gap=ans.row_size();
-  for (int ii=0; ii<_inum; ii++) {
-    acctyp *ap=_read_loc+ii;
-    int i=ilist[ii];
-    f[i][0]+=*ap;
-    ap+=gap;
-    f[i][1]+=*ap;
-    ap+=gap;
-    f[i][2]+=*ap;
-  }
-}
-
-template <class numtyp, class acctyp>
-void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
-  int gap=ans.row_size();
-  _read_loc+=gap*3;
-  for (int ii=0; ii
[...]
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
[...]
+  template <class cpytyp>
+  inline void add_q_data(const cpytyp *host_ptr) {
+    const int end=_nall*4;
+    for (int i=0; i
[...]
-  NVC_Mat<acctyp> ans;
+  // atom coordinates
+  NVC_Vec<numtyp> dev_x;
+  // quaternions
+  NVC_Vec<numtyp> dev_q;
+  // ans_fields()
+  // example: if (eflag and vflag) 1 is energy, 2-7 is virial
+  NVC_Vec<acctyp> ans;
 
   // Buffer for moving floating point data to GPU
   NVC_HostT host_write;
diff --git a/lib/gpu/pair_gpu_cell.cu b/lib/gpu/pair_gpu_cell.cu
index 2f167df15f..3e6aa39bc5 100644
--- a/lib/gpu/pair_gpu_cell.cu
+++ b/lib/gpu/pair_gpu_cell.cu
@@ -17,6 +17,12 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include
 #include "lj_gpu_memory.h"
 #include "pair_gpu_cell.h"
diff --git a/lib/gpu/pair_gpu_nbor.cu b/lib/gpu/pair_gpu_nbor.cu
index 88fdef8dfe..20fba00691 100644
--- a/lib/gpu/pair_gpu_nbor.cu
+++ b/lib/gpu/pair_gpu_nbor.cu
@@ -26,8 +26,9 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
   return (max_nbors+3)*sizeof(int);
 }
 
-void PairGPUNbor::init(const int ij_size, const int max_atoms,
+bool PairGPUNbor::init(const int ij_size, const int max_atoms,
                        const int max_nbors) {
+  bool success=true;
   if (allocated)
     clear();
 
@@ -35,16 +36,29 @@ void PairGPUNbor::init(const int ij_size, const int max_atoms,
   time_nbor.init();
 
   if (_use_packing)
-    dev_nbor.safe_alloc(max_nbors+4,max_atoms);
+    success=success && dev_nbor.alloc((max_nbors+4)*max_atoms);
   else
-    dev_nbor.safe_alloc(3,max_atoms);
+    success=success && dev_nbor.alloc(3*max_atoms);
 
-  ij.safe_alloc(max_nbors*max_atoms);
-  host_ij.safe_alloc_w(ij_size);
+  success=success && ij.alloc(max_nbors*max_atoms);
+  success=success && host_ij.alloc_w(ij_size);
 
   allocated=true;
+
+  return success;
 }
 
+void PairGPUNbor::resize(const int nlocal, const int max_nbor, bool &success) {
+  dev_nbor.clear();
+  ij.clear();
+  if (_use_packing)
+    success=success && dev_nbor.alloc((max_nbor+4)*nlocal);
+  else
+    success=success && dev_nbor.alloc(3*nlocal);
+  success=success && ij.alloc(max_nbor*nlocal);
+  allocated=true;
+}
+
 void PairGPUNbor::clear() {
   if (!allocated)
     return;
@@ -54,7 +68,7 @@ void PairGPUNbor::clear() {
   host_ij.clear();
   dev_nbor.clear();
 }
-
+
 double PairGPUNbor::host_memory_usage() const {
   return IJ_SIZE*sizeof(int)+sizeof(PairGPUNbor);
 }
@@ -65,12 +79,35 @@ void PairGPUNbor::reset(const int inum, int *ilist, const int *numj,
 
   dev_nbor.copy_from_host(ilist,inum);
   int acc=0;
-  for (int i=0; i
[...]
+    if (hi>0) {
+      host_ij.copy_to_device(dev_nbor.begin()+inum+offset,hi,s);
+      host_ij.copy_to_device(half,dev_nbor.begin()+2*inum+offset,hi,s);
+    }
+  }
 }
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index 6d19972530..c505120785 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -40,8 +40,10 @@ class PairGPUNbor {
   void packing(const bool use_packing) { _use_packing=use_packing; }
 
   /// Called once to allocate memory
-  void init(const int ij_size, const int max_atoms, const int max_nbors);
+  bool init(const int ij_size, const int max_atoms, const int max_nbors);
 
+  void resize(const int nlocal, const int max_nbor, bool &success);
+
   /// Free all memory on host and device
   void clear();
 
@@ -73,7 +75,7 @@ class PairGPUNbor {
   // - 1st row is i
   // - 2nd row is numj (number of neighbors)
   // - 3rd row is starting address in host_ij of neighbors
-  NVC_MatI dev_nbor;
+  NVC_VecI dev_nbor;
 
   // --------------- Timing Stuff
   NVCTimer time_nbor;
diff --git a/lib/gpu/pair_gpu_texture.h b/lib/gpu/pair_gpu_texture.h
index accc4e6b29..6c26b47952 100644
--- a/lib/gpu/pair_gpu_texture.h
+++ b/lib/gpu/pair_gpu_texture.h
@@ -31,32 +31,6 @@
 #define GB_GPU_DOUBLE
 #endif
 
-// ------------------------------- x ------------------------------------
-
-static texture x_float_tex;
-static texture x_double_tex;
-template <class numtyp> inline textureReference * x_get_texture() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"x_float_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <> inline textureReference * x_get_texture<double>() {
-  const textureReference *ptr;
-  cudaGetTextureReference(&ptr,"x_double_tex");
-  return const_cast<textureReference *>(ptr);
-}
-template <class numtyp>
-static __inline__ __device__ numtyp _x_(const int i, const int j) {
-  return tex2D(x_float_tex,i,j);
-}
-#ifdef GB_GPU_DOUBLE
-template <>
-static __inline__ __device__ double _x_<double>(const int i,const int j) {
-  int2 t=tex2D(x_double_tex,i,j);
-  return __hiloint2double(t.y, t.x);
-}
-#endif
-
 // ------------------------------- form ------------------------------------
 
 static texture form_tex;
diff --git a/lib/gpu/pair_tex_tar.cu b/lib/gpu/pair_tex_tar.cu
index b9a275ee22..426d2ff059 100644
--- a/lib/gpu/pair_tex_tar.cu
+++ b/lib/gpu/pair_tex_tar.cu
@@ -17,6 +17,12 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#if defined(__APPLE__)
+#if _GLIBCXX_ATOMIC_BUILTINS == 1
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#endif // _GLIBCXX_ATOMIC_BUILTINS
+#endif // __APPLE__
+
 #include "pair_gpu_atom.cu"
 #include "lj_gpu.cu"
 #include "gb_gpu.cu"
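
NOTE: The recurring kernel change in this patch replaces the old 2D position
texture (_x_: rows 0-2 hold xyz, row 7 the type) with a flat array of vec4
values that packs the atom type code into the .w component, so each thread
issues a single coalesced 16-byte load per atom. Below is a minimal sketch of
a kernel reading that layout; it assumes a single-precision build where vec4
is float4, and the kernel and buffer names are illustrative only, not part of
the library:

  #include <cuda_runtime.h>

  typedef float4 vec4;   // stand-in for the library's vec4 (float build)

  // Reads the packed layout: xyz plus the type code stored in .w.
  __global__ void unpack_demo(const vec4 *x_, float *rsq, int *type,
                              const int inum) {
    int i=blockIdx.x*blockDim.x+threadIdx.x;
    if (i<inum) {
      vec4 ix=x_[i];    // one coalesced load replaces four strided fetches
      type[i]=ix.w;     // implicit float-to-int conversion, as in the kernels
      rsq[i]=ix.x*ix.x+ix.y*ix.y+ix.z*ix.z;
    }
  }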
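NOTE: On the host side, the patch drops the fixed PERCENT_GPU_MEMORY /
MAX_ATOMS cap (get_max_atoms) in favor of demand-driven reallocation:
resize_atom() and resize_local() grow buffers only when nall, nlocal, or the
maximum neighbor count exceeds the current capacity, overshoot by 10% to
amortize future growth, and report failure through a success flag instead of
silently limiting the atom count. A compact sketch of that policy; grow_device
and its parameters are hypothetical names, not the library's API:

  #include <cuda_runtime.h>

  // Reallocate only on overflow, with 10% headroom (cf. resize_atom()).
  template <class T>
  bool grow_device(T *&ptr, int &capacity, const int needed) {
    if (needed<=capacity)
      return true;                  // current buffer still fits, keep it
    capacity=static_cast<int>(static_cast<double>(needed)*1.10);
    if (ptr!=NULL)
      cudaFree(ptr);                // old contents are rebuilt by the caller
    return cudaMalloc((void **)&ptr,capacity*sizeof(T))==cudaSuccess;
  }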
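NOTE: nvc_memory.h gains two fallible page-locked allocators: alloc_rw() wraps
cudaMallocHost for buffers the host reads back (host_read), while alloc_w()
wraps cudaHostAlloc with cudaHostAllocWriteCombined for upload-only buffers
(host_write). Write-combined memory is fast for the host to fill and to stream
over PCIe, but uncached for host reads. A short sketch of the distinction, with
hypothetical buffer names:

  #include <cuda_runtime.h>

  bool alloc_host_buffers(float *&upload, float *&readback, const size_t n) {
    // write-combined pinned memory: fill it, never read it back (cf. alloc_w)
    if (cudaHostAlloc((void **)&upload,n*sizeof(float),
                      cudaHostAllocWriteCombined)!=cudaSuccess)
      return false;
    // ordinary pinned memory: cached on the host, suitable for results that
    // are read after a device-to-host copy (cf. alloc_rw)
    if (cudaMallocHost((void **)&readback,n*sizeof(float))!=cudaSuccess)
      return false;
    return true;
  }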