diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 4f6031f229..b73e25dbc6 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -35,8 +35,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   // process any command-line args that invoke Kokkos settings
 
   int device = 0;
-  int num_threads = 1;
-  int numa = 1;
+  num_threads = 1;
+  numa = 1;
 
   int iarg = 0;
   while (iarg < narg) {
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 512c76a489..84a87279fc 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -27,6 +27,8 @@ class KokkosLMP : protected Pointers {
   int forward_comm_classic;
   int exchange_comm_on_host;
   int forward_comm_on_host;
+  int num_threads;
+  int numa;
 
   KokkosLMP(class LAMMPS *, int, char **);
   ~KokkosLMP();
diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp
index 4c130e67b5..1644fca80a 100644
--- a/src/USER-CUDA/comm_cuda.cpp
+++ b/src/USER-CUDA/comm_cuda.cpp
@@ -75,7 +75,7 @@ CommCuda::CommCuda(LAMMPS *lmp) : CommBrick(lmp)
   buf_send = NULL;
   buf_recv = NULL;
 
-  Comm::free_swap();
+  CommBrick::free_swap();
   allocate_swap(maxswap);
 }
 
@@ -132,7 +132,7 @@ void CommCuda::init()
   cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
   cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
 
-  Comm::init();
+  CommBrick::init();
 }
 
 /* ----------------------------------------------------------------------
@@ -145,7 +145,7 @@ void CommCuda::init()
 void CommCuda::setup()
 {
   if(cuda->shared_data.pair.neighall) cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser);
-  Comm::setup();
+  CommBrick::setup();
 
   //upload changed geometry to device
     if(style == SINGLE)
@@ -197,7 +197,7 @@ void CommCuda::forward_comm_cuda()
   if(not comm_x_only && not avec->cudable)
   {
           cuda->downloadAll();
-    Comm::forward_comm();
+    CommBrick::forward_comm();
     cuda->uploadAll();
     return;
   }
@@ -630,7 +630,7 @@ void CommCuda::forward_comm_pair(Pair *pair)
 {
   if(not cuda->shared_data.pair.cudable_force)
   {
-          return Comm::forward_comm_pair(pair);
+          return CommBrick::forward_comm_pair(pair);
   }
 
   int iswap,n;
@@ -753,7 +753,7 @@ void CommCuda::exchange()
 
   if(not cuda->oncpu) cuda->downloadAll();
 
-  Comm::exchange();
+  CommBrick::exchange();
 }
 
 
@@ -887,7 +887,7 @@ void CommCuda::borders()
            return;
   }
 
-  Comm::borders();
+  CommBrick::borders();
 
   cuda->setSystemParams();
   if(cuda->finished_setup) {cuda->checkResize(); cuda->uploadAll();}
@@ -1313,7 +1313,7 @@ void CommCuda::grow_list(int iswap, int n)
 void CommCuda::grow_swap(int n)
 {
   int oldmaxswap=maxswap;
-  Comm::grow_swap(n);
+  CommBrick::grow_swap(n);
   if(n>cu_sendlist->get_dim()[0])
   {
    MYDBG(printf(" # CUDA CommCuda::grow_swap\n");)
@@ -1357,7 +1357,7 @@ void CommCuda::grow_swap(int n)
 
 void CommCuda::allocate_swap(int n)
 {
-   Comm::allocate_swap(n);
+   CommBrick::allocate_swap(n);
 
           delete cu_pbc;
           delete cu_slablo;
@@ -1392,7 +1392,7 @@ void CommCuda::allocate_swap(int n)
 
 void CommCuda::allocate_multi(int n)
 {
-  Comm::allocate_multi(n);
+  CommBrick::allocate_multi(n);
 
           delete cu_multilo;
           delete cu_multihi;
@@ -1410,7 +1410,7 @@ void CommCuda::allocate_multi(int n)
 void CommCuda::free_swap()
 {
 
-  Comm::free_swap();
+  CommBrick::free_swap();
 
   delete cuda->shared_data.comm.nsend_swap; cuda->shared_data.comm.nsend_swap=NULL;
   delete cu_pbc; cu_pbc = NULL;
@@ -1431,7 +1431,7 @@ void CommCuda::free_swap()
 
 void CommCuda::free_multi()
 {
-  Comm::free_multi();
+  CommBrick::free_multi();
   delete cu_multilo; cu_multilo = NULL;
   delete cu_multihi; cu_multihi = NULL;
 }
diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.cpp b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
index 08360a996b..4b31d0d869 100644
--- a/src/USER-CUDA/pair_gran_hooke_cuda.cpp
+++ b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
@@ -168,8 +168,6 @@ void PairGranHookeCuda::init_style()
 
   dt = update->dt;
 
-
-
   // check for Fix freeze and set freeze_group_bit
 
   for (i = 0; i < modify->nfix; i++)
@@ -178,42 +176,52 @@ void PairGranHookeCuda::init_style()
   else freeze_group_bit = 0;
 
   cuda->shared_data.pair.freeze_group_bit=freeze_group_bit;
-  // check for Fix pour and set pour_type and pour_maxdiam
 
-  int pour_type = 0;
-  double pour_maxrad = 0.0;
-  for (i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"pour") == 0) break;
-  if (i < modify->nfix) {
-    pour_type = ((FixPour *) modify->fix[i])->ntype;
-    pour_maxrad = ((FixPour *) modify->fix[i])->radius_max;
-  }
+  // check for FixPour and FixDeposit so can extract particle radii
+
+  int ipour;
+  for (ipour = 0; ipour < modify->nfix; ipour++)
+    if (strcmp(modify->fix[ipour]->style,"pour") == 0) break;
+  if (ipour == modify->nfix) ipour = -1;
+
+  int idep;
+  for (idep = 0; idep < modify->nfix; idep++)
+    if (strcmp(modify->fix[idep]->style,"deposit") == 0) break;
+  if (idep == modify->nfix) idep = -1;
 
   // set maxrad_dynamic and maxrad_frozen for each type
-  // include future Fix pour particles as dynamic
+  // include future FixPour and FixDeposit particles as dynamic
 
-  for (i = 1; i <= atom->ntypes; i++)
+  int itype;
+  for (i = 1; i <= atom->ntypes; i++) {
     onerad_dynamic[i] = onerad_frozen[i] = 0.0;
-  if (pour_type) onerad_dynamic[pour_type] = pour_maxrad;
+    if (ipour >= 0) {
+      itype = i;
+      onerad_dynamic[i] = 
+        *((double *) modify->fix[ipour]->extract("radius",itype));
+    }
+    if (idep >= 0) {
+      itype = i;
+      onerad_dynamic[i] = 
+        *((double *) modify->fix[idep]->extract("radius",itype));
+    }
+  }
 
   double *radius = atom->radius;
   int *mask = atom->mask;
   int *type = atom->type;
   int nlocal = atom->nlocal;
 
-  for (i = 0; i < nlocal; i++){
+  for (i = 0; i < nlocal; i++)
     if (mask[i] & freeze_group_bit)
       onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]],radius[i]);
     else
       onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]],radius[i]);
-  }
 
   MPI_Allreduce(&onerad_dynamic[1],&maxrad_dynamic[1],atom->ntypes,
                 MPI_DOUBLE,MPI_MAX,world);
   MPI_Allreduce(&onerad_frozen[1],&maxrad_frozen[1],atom->ntypes,
                 MPI_DOUBLE,MPI_MAX,world);
-
-  MYDBG(printf("# CUDA PairGranHookeCuda::init_style end\n"); )
 }
 
 void PairGranHookeCuda::init_list(int id, NeighList *ptr)
diff --git a/src/USER-OMP/fix_omp.cpp b/src/USER-OMP/fix_omp.cpp
index f5808a3457..9b0771ceef 100644
--- a/src/USER-OMP/fix_omp.cpp
+++ b/src/USER-OMP/fix_omp.cpp
@@ -134,6 +134,7 @@ FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg)
   // allocate list for per thread accumulator manager class instances
   // and then have each thread create an instance of this class to
   // encourage the OS to use storage that is "close" to each thread's CPU.
+
   thr = new ThrData *[nthreads];
   _nthr = nthreads;
 #if defined(_OPENMP)
@@ -207,6 +208,7 @@ void FixOMP::init()
   // kspace_split == 0 : regular processing
   // kspace_split < 0  : master partition, does not do kspace
   // kspace_split > 0  : slave partition, only does kspace
+
   if (strstr(update->integrate_style,"verlet/split") != NULL) {
     if (universe->iworld == 0) kspace_split = -1;
     else kspace_split = 1;
diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp
index 2be56389a3..93a81c3ad2 100644
--- a/src/comm_brick.cpp
+++ b/src/comm_brick.cpp
@@ -35,6 +35,7 @@
 #include "compute.h"
 #include "output.h"
 #include "dump.h"
+#include "accelerator_kokkos.h"
 #include "math_extra.h"
 #include "error.h"
 #include "memory.h"
@@ -76,7 +77,9 @@ CommBrick::CommBrick(LAMMPS *lmp) : Comm(lmp)
 
   nthreads = 1;
 #ifdef _OPENMP
-  if (getenv("OMP_NUM_THREADS") == NULL) {
+  if (lmp->kokkos) {
+    nthreads = lmp->kokkos->num_threads * lmp->kokkos->numa;
+  } else if (getenv("OMP_NUM_THREADS") == NULL) {
     nthreads = 1;
     if (me == 0)
       error->warning(FLERR,"OMP_NUM_THREADS environment is not set.");
@@ -87,7 +90,7 @@ CommBrick::CommBrick(LAMMPS *lmp) : Comm(lmp)
   // enforce consistent number of threads across all MPI tasks
 
   MPI_Bcast(&nthreads,1,MPI_INT,0,world);
-  omp_set_num_threads(nthreads);
+  if (!lmp->kokkos) omp_set_num_threads(nthreads);
 
   if (me == 0) {
     if (screen)