diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..fdee6325d0 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,21 @@ +# This file contains file patterns that trigger automatic
+# code review requests from users that are owners of these files
+# Order matters; the last match has the highest precedence
+
+# library folders
+lib/colvars/* @giacomofiorin
+lib/compress/* @akohlmey
+lib/kokkos/* @stanmoore1
+lib/molfile/* @akohlmey
+lib/qmmm/* @akohlmey
+lib/vtk/* @rbberger
+
+# packages
+src/KOKKOS @stanmoore1
+src/USER-CGSDK @akohlmey
+src/USER-COLVARS @giacomofiorin
+src/USER-OMP @akohlmey
+src/USER-QMMM @akohlmey
+
+# tools
+tools/msi2lmp/* @akohlmey diff --git a/.gitignore b/.gitignore index 74e511515e..50b970249a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,11 @@ log.cite .Trashes ehthumbs.db Thumbs.db +
+#cmake
+/build*
+/CMakeCache.txt
+/CMakeFiles/
+/Makefile
+/cmake_install.cmake
+/lmp diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt new file mode 100644 index 0000000000..76c28fcb72 --- /dev/null +++ b/cmake/CMakeLists.txt @@ -0,0 +1,547 @@ +########################################
+# CMake build system
+# This file is part of LAMMPS
+# Created by Christoph Junghans and Richard Berger
+cmake_minimum_required(VERSION 3.1)
+
+project(lammps)
+set(SOVERSION 0)
+set(LAMMPS_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../src)
+set(LAMMPS_LIB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../lib)
+set(LAMMPS_LIB_BINARY_DIR ${CMAKE_BINARY_DIR}/lib)
+
+# To avoid conflicts with the old Makefile build system, we build everything here
+file(GLOB LIB_SOURCES ${LAMMPS_SOURCE_DIR}/*.cpp)
+file(GLOB LMP_SOURCES ${LAMMPS_SOURCE_DIR}/main.cpp)
+list(REMOVE_ITEM LIB_SOURCES ${LMP_SOURCES})
+
+# CMake modules/macros are in a subdirectory to keep this file cleaner
+set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/Modules)
+
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
+  #release comes with -O3 by default
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
+
+foreach(STYLE_FILE style_angle.h style_atom.h style_body.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h
+  style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_nbin.h style_npair.h style_nstencil.h
+  style_ntopo.h style_pair.h style_reader.h style_region.h)
+  if(EXISTS ${LAMMPS_SOURCE_DIR}/${STYLE_FILE})
+    message(FATAL_ERROR "There is a ${STYLE_FILE} in ${LAMMPS_SOURCE_DIR}, please clean up the source directory first")
+  endif()
+endforeach()
+
+enable_language(CXX)
+
+######################################################################
+# compiler tests
+# these need to be done early (before further tests).
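For illustration only (not part of this patch), the CheckCCompilerFlag module included below is typically used as follows; the flag and result variable here are made-up examples:

```
# probe whether the C compiler accepts a given flag;
# the result is stored in a cache variable
include(CheckCCompilerFlag)
check_c_compiler_flag("-ffast-math" COMPILER_SUPPORTS_FFAST_MATH)
if(COMPILER_SUPPORTS_FFAST_MATH)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
endif()
```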
+
#####################################################################
+include(CheckCCompilerFlag)
+
+########################################################################
+# User input options #
+########################################################################
+option(BUILD_SHARED_LIBS "Build shared libs" OFF)
+option(INSTALL_LIB "Install lammps library and header" ON)
+include(GNUInstallDirs)
+
+set(LAMMPS_LINK_LIBS)
+option(ENABLE_MPI "Build MPI version" OFF)
+if(ENABLE_MPI)
+  find_package(MPI REQUIRED)
+  include_directories(${MPI_C_INCLUDE_PATH})
+  list(APPEND LAMMPS_LINK_LIBS ${MPI_CXX_LIBRARIES})
+  option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
+  if(LAMMPS_LONGLONG_TO_LONG)
+    add_definitions(-DLAMMPS_LONGLONG_TO_LONG)
+  endif()
+else()
+  file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c)
+  list(APPEND LIB_SOURCES ${MPI_SOURCES})
+  include_directories(${LAMMPS_SOURCE_DIR}/STUBS)
+endif()
+
+set(LAMMPS_SIZE_LIMIT "LAMMPS_SMALLBIG" CACHE STRING "Lammps size limit")
+set_property(CACHE LAMMPS_SIZE_LIMIT PROPERTY STRINGS LAMMPS_SMALLBIG LAMMPS_BIGBIG LAMMPS_SMALLSMALL)
+add_definitions(-D${LAMMPS_SIZE_LIMIT})
+
+set(LAMMPS_MEMALIGN "64" CACHE STRING "enables the use of the posix_memalign() call instead of malloc() when large chunks of memory are allocated by LAMMPS")
+add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
+
+option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
+if(LAMMPS_EXCEPTIONS)
+  add_definitions(-DLAMMPS_EXCEPTIONS)
+endif()
+
+option(CMAKE_VERBOSE_MAKEFILE "Verbose makefile" OFF)
+
+option(ENABLE_TESTING "Enable testing" OFF)
+if(ENABLE_TESTING)
+  enable_testing()
+endif(ENABLE_TESTING)
+
+option(ENABLE_ALL "Build all default packages" OFF)
+set(DEFAULT_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS CORESHELL DIPOLE GRANULAR
+  KSPACE MANYBODY MC MEAM MISC MOLECULE PERI QEQ
+  REAX REPLICA RIGID SHOCK SNAP SRD)
+set(OTHER_PACKAGES KIM PYTHON MSCG MPIIO VORONOI POEMS
+  USER-ATC USER-AWPMD USER-CGDNA
+  USER-CGSDK USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF
+  USER-FEP USER-H5MD USER-LB USER-MANIFOLD USER-MEAMC USER-MGPT USER-MISC
+  USER-MOLFILE USER-NETCDF USER-PHONON USER-QTB USER-REAXC USER-SMD
+  USER-SMTBQ USER-SPH USER-TALLY USER-VTK USER-QUIP USER-QMMM)
+set(ACCEL_PACKAGES USER-OMP KOKKOS OPT USER-INTEL GPU)
+foreach(PKG ${DEFAULT_PACKAGES})
+  option(ENABLE_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
+endforeach()
+foreach(PKG ${ACCEL_PACKAGES} ${OTHER_PACKAGES})
+  option(ENABLE_${PKG} "Build ${PKG} Package" OFF)
+endforeach()
+
+macro(pkg_depends PKG1 PKG2)
+  if(ENABLE_${PKG1} AND NOT ENABLE_${PKG2})
+    message(FATAL_ERROR "${PKG1} package needs LAMMPS to be built with ${PKG2}")
+  endif()
+endmacro()
+
+pkg_depends(MPIIO MPI)
+pkg_depends(QEQ MANYBODY)
+pkg_depends(USER-ATC MANYBODY)
+pkg_depends(USER-H5MD MPI)
+pkg_depends(USER-LB MPI)
+pkg_depends(USER-MISC MANYBODY)
+pkg_depends(USER-PHONON KSPACE)
+
+if(ENABLE_BODY AND ENABLE_POEMS)
+  message(FATAL_ERROR "BODY and POEMS cannot be enabled at the same time")
+endif()
+
+######################################################
+# packages with special compiler needs or external libs
+######################################################
+if(ENABLE_REAX OR ENABLE_MEAM OR ENABLE_USER-QUIP OR ENABLE_USER-QMMM)
+  enable_language(Fortran)
+endif()
+
+if(ENABLE_KOKKOS OR ENABLE_MSCG)
+  # starting with CMake 3.1 this is all you have to do to enforce C++11
+ 
set(CMAKE_CXX_STANDARD 11) # C++11...
+  set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required...
+  set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11
+endif()
+
+if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
+  find_package(OpenMP REQUIRED)
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+if(ENABLE_KSPACE)
+  set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
+  set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)
+  if(NOT FFT STREQUAL "KISSFFT")
+    find_package(${FFT} REQUIRED)
+    add_definitions(-DFFT_${FFT})
+    include_directories(${${FFT}_INCLUDE_DIRS})
+    list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
+  endif()
+  set(PACK_OPTIMIZATION "PACK_ARRAY" CACHE STRING "Optimization for FFT")
+  set_property(CACHE PACK_OPTIMIZATION PROPERTY STRINGS PACK_ARRAY PACK_POINTER PACK_MEMCPY)
+  if(NOT PACK_OPTIMIZATION STREQUAL "PACK_ARRAY")
+    add_definitions(-D${PACK_OPTIMIZATION})
+  endif()
+endif()
+
+if(ENABLE_MISC)
+  option(LAMMPS_XDR "include XDR compatibility files for doing particle dumps in XTC format" OFF)
+  if(LAMMPS_XDR)
+    add_definitions(-DLAMMPS_XDR)
+  endif()
+endif()
+
+if(ENABLE_MSCG OR ENABLE_USER-ATC OR ENABLE_USER-AWPMD OR ENABLE_USER-QUIP)
+  find_package(LAPACK)
+  if(LAPACK_FOUND)
+    list(APPEND LAMMPS_LINK_LIBS ${LAPACK_LIBRARIES})
+  else()
+    enable_language(Fortran)
+    file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.f)
+    list(APPEND LIB_SOURCES ${LAPACK_SOURCES})
+  endif()
+endif()
+
+if(ENABLE_PYTHON)
+  find_package(PythonInterp REQUIRED)
+  find_package(PythonLibs REQUIRED)
+  add_definitions(-DLMP_PYTHON)
+  include_directories(${PYTHON_INCLUDE_DIR})
+  list(APPEND LAMMPS_LINK_LIBS ${PYTHON_LIBRARY})
+  if(NOT PYTHON_INSTDIR)
+    execute_process(COMMAND ${PYTHON_EXECUTABLE}
+      -c "import distutils.sysconfig as cg; print(cg.get_python_lib(1,0,prefix='${CMAKE_INSTALL_PREFIX}'))"
+      OUTPUT_VARIABLE PYTHON_INSTDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+  install(FILES ${CMAKE_SOURCE_DIR}/../python/lammps.py DESTINATION ${PYTHON_INSTDIR})
+  if(NOT BUILD_SHARED_LIBS)
+    message(FATAL_ERROR "Python package needs LAMMPS to be built shared, use -DBUILD_SHARED_LIBS=ON")
+  endif()
+endif()
+
+find_package(JPEG)
+if(JPEG_FOUND)
+  add_definitions(-DLAMMPS_JPEG)
+  include_directories(${JPEG_INCLUDE_DIR})
+  list(APPEND LAMMPS_LINK_LIBS ${JPEG_LIBRARIES})
+endif()
+
+find_package(PNG)
+find_package(ZLIB)
+if(PNG_FOUND AND ZLIB_FOUND)
+  include_directories(${PNG_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${PNG_LIBRARIES} ${ZLIB_LIBRARIES})
+  add_definitions(-DLAMMPS_PNG)
+endif()
+
+find_program(GZIP_EXECUTABLE gzip)
+find_package_handle_standard_args(GZIP REQUIRED_VARS GZIP_EXECUTABLE)
+if(GZIP_FOUND)
+  add_definitions(-DLAMMPS_GZIP)
+endif()
+
+find_program(FFMPEG_EXECUTABLE ffmpeg)
+find_package_handle_standard_args(FFMPEG REQUIRED_VARS FFMPEG_EXECUTABLE)
+if(FFMPEG_FOUND)
+  add_definitions(-DLAMMPS_FFMPEG)
+endif()
+
+if(ENABLE_VORONOI)
+  find_package(VORO REQUIRED) #some distros
+  include_directories(${VORO_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
+endif()
+
+if(ENABLE_USER-MOLFILE)
+  list(APPEND LAMMPS_LINK_LIBS ${CMAKE_DL_LIBS})
+endif()
+
+if(ENABLE_USER-NETCDF)
+  find_package(NetCDF REQUIRED)
+  include_directories(${NETCDF_INCLUDE_DIR})
+  list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARY})
+  add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
+endif()
+
+if(ENABLE_USER-SMD)
+  find_package(Eigen3 
REQUIRED)
+  include_directories(${EIGEN3_INCLUDE_DIR})
+endif()
+
+if(ENABLE_USER-QUIP)
+  find_package(QUIP REQUIRED)
+  list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+endif()
+
+if(ENABLE_USER-QMMM)
+  find_package(QE REQUIRED)
+  include_directories(${QE_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+endif()
+
+if(ENABLE_USER-AWPMD)
+  include_directories(${LAMMPS_LIB_SOURCE_DIR}/awpmd/systems/interact
+    ${LAMMPS_LIB_SOURCE_DIR}/awpmd/ivutils/include)
+endif()
+
+if(ENABLE_USER-H5MD)
+  find_package(HDF5 REQUIRED)
+  list(APPEND LAMMPS_LINK_LIBS ${HDF5_LIBRARIES})
+  include_directories(${HDF5_INCLUDE_DIRS} ${LAMMPS_LIB_SOURCE_DIR}/h5md/include)
+endif()
+
+if(ENABLE_USER-VTK)
+  find_package(VTK REQUIRED NO_MODULE)
+  include(${VTK_USE_FILE})
+  add_definitions(-DLAMMPS_VTK)
+  list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
+endif()
+
+if(ENABLE_KIM)
+  find_package(KIM REQUIRED)
+  list(APPEND LAMMPS_LINK_LIBS ${KIM_LIBRARIES})
+  include_directories(${KIM_INCLUDE_DIRS})
+endif()
+
+if(ENABLE_MSCG)
+  find_package(GSL REQUIRED)
+  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mscg)
+  set(MSCG_TARBALL ${LAMMPS_LIB_MSCG_BIN_DIR}/MS-CG-master.zip)
+  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_MSCG_BIN_DIR}/MSCG-release-master/src)
+  if(NOT EXISTS ${LAMMPS_LIB_MSCG_BIN_DIR})
+    if(NOT EXISTS ${MSCG_TARBALL})
+      message(STATUS "Downloading ${MSCG_TARBALL}")
+      file(DOWNLOAD
+        https://github.com/uchicago-voth/MSCG-release/archive/master.zip
+        ${MSCG_TARBALL} SHOW_PROGRESS) #EXPECTED_MD5 cannot be used since master is a moving target
+    endif()
+    message(STATUS "Unpacking ${MSCG_TARBALL}")
+    execute_process(COMMAND ${CMAKE_COMMAND} -E tar xvf ${MSCG_TARBALL}
+      WORKING_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/mscg)
+  endif()
+  file(GLOB MSCG_SOURCES ${LAMMPS_LIB_MSCG_BIN_DIR}/*.cpp)
+  list(APPEND LIB_SOURCES ${MSCG_SOURCES})
+  foreach(MSCG_SOURCE ${MSCG_SOURCES})
+    set_property(SOURCE ${MSCG_SOURCE} APPEND PROPERTY COMPILE_DEFINITIONS
+      DIMENSION=3 _exclude_gromacs=1)
+  endforeach()
+  include_directories(${LAMMPS_LIB_MSCG_BIN_DIR} ${GSL_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${GSL_LIBRARIES})
+endif()
+
+########################################################################
+# Basic system tests (standard libraries, headers, functions, types) #
+########################################################################
+include(CheckIncludeFile)
+foreach(HEADER math.h)
+  check_include_file(${HEADER} FOUND_${HEADER})
+  if(NOT FOUND_${HEADER})
+    message(FATAL_ERROR "Could not find needed header - ${HEADER}")
+  endif(NOT FOUND_${HEADER})
+endforeach(HEADER)
+
+set(MATH_LIBRARIES "m" CACHE STRING "math library")
+mark_as_advanced( MATH_LIBRARIES )
+include(CheckLibraryExists)
+foreach(FUNC sin cos)
+  check_library_exists(${MATH_LIBRARIES} ${FUNC} "" FOUND_${FUNC}_${MATH_LIBRARIES})
+  if(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
+    message(FATAL_ERROR "Could not find needed math function - ${FUNC}")
+  endif(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
+endforeach(FUNC)
+list(APPEND LAMMPS_LINK_LIBS ${MATH_LIBRARIES})
+
+######################################
+# Generate Basic Style files
+######################################
+include(StyleHeaderUtils)
+RegisterStyles(${LAMMPS_SOURCE_DIR})
+
+##############################################
+# add sources of enabled packages
+############################################
+foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES})
+  if(ENABLE_${PKG})
+    set(${PKG}_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/${PKG})
+
+ 
# detects styles in package and adds them to global list + RegisterStyles(${${PKG}_SOURCES_DIR}) + + file(GLOB ${PKG}_SOURCES ${${PKG}_SOURCES_DIR}/*.cpp) + list(APPEND LIB_SOURCES ${${PKG}_SOURCES}) + include_directories(${${PKG}_SOURCES_DIR}) + endif() +endforeach() + +############################################## +# add lib sources of (simple) enabled packages +############################################ +foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD + USER-MOLFILE USER-QMMM) + if(ENABLE_${SIMPLE_LIB}) + string(REGEX REPLACE "^USER-" "" SIMPLE_LIB "${SIMPLE_LIB}") + string(TOLOWER "${SIMPLE_LIB}" INC_DIR) + file(GLOB_RECURSE ${SIMPLE_LIB}_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.F + ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.c ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.cpp) + list(APPEND LIB_SOURCES ${${SIMPLE_LIB}_SOURCES}) + include_directories(${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}) + endif() +endforeach() + +###################################################################### +# packages which selectively include variants based on enabled styles +# e.g. accelerator packages +###################################################################### +if(ENABLE_USER-OMP) + set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP) + set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp + ${USER-OMP_SOURCES_DIR}/thr_omp.cpp + ${USER-OMP_SOURCES_DIR}/fix_nh_omp.cpp + ${USER-OMP_SOURCES_DIR}/fix_nh_sphere_omp.cpp) + set_property(GLOBAL PROPERTY "OMP_SOURCES" "${USER-OMP_SOURCES}") + + # detects styles which have USER-OMP version + RegisterStylesExt(${USER-OMP_SOURCES_DIR} omp OMP_SOURCES) + + get_property(USER-OMP_SOURCES GLOBAL PROPERTY OMP_SOURCES) + + list(APPEND LIB_SOURCES ${USER-OMP_SOURCES}) + include_directories(${USER-OMP_SOURCES_DIR}) +endif() + +if(ENABLE_KOKKOS) + set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) + set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos) + add_definitions(-DLMP_KOKKOS) + add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR}) + + set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src + ${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src + ${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src + ${LAMMPS_LIB_KOKKOS_BIN_DIR}) + include_directories(${Kokkos_INCLUDE_DIRS}) + list(APPEND LAMMPS_LINK_LIBS kokkos) + + set(KOKKOS_PKG_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/KOKKOS) + set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/atom_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/fix_nh_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/domain_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp) + set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}") + + # detects styles which have KOKKOS version + RegisterStylesExt(${KOKKOS_PKG_SOURCES_DIR} kokkos KOKKOS_PKG_SOURCES) + + get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES) + + list(APPEND LIB_SOURCES ${KOKKOS_PKG_SOURCES}) + include_directories(${KOKKOS_PKG_SOURCES_DIR}) +endif() + +if(ENABLE_OPT) + set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT) + set(OPT_SOURCES) + set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}") + + # detects styles which have OPT version + RegisterStylesExt(${OPT_SOURCES_DIR} opt 
OPT_SOURCES)
+
+  get_property(OPT_SOURCES GLOBAL PROPERTY OPT_SOURCES)
+
+  list(APPEND LIB_SOURCES ${OPT_SOURCES})
+  include_directories(${OPT_SOURCES_DIR})
+endif()
+
+if(ENABLE_USER-INTEL)
+  set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
+  set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/intel_preprocess.h
+    ${USER-INTEL_SOURCES_DIR}/intel_buffers.h
+    ${USER-INTEL_SOURCES_DIR}/intel_buffers.cpp
+    ${USER-INTEL_SOURCES_DIR}/math_extra_intel.h
+    ${USER-INTEL_SOURCES_DIR}/nbin_intel.h
+    ${USER-INTEL_SOURCES_DIR}/nbin_intel.cpp
+    ${USER-INTEL_SOURCES_DIR}/npair_intel.h
+    ${USER-INTEL_SOURCES_DIR}/npair_intel.cpp
+    ${USER-INTEL_SOURCES_DIR}/intel_simd.h
+    ${USER-INTEL_SOURCES_DIR}/intel_intrinsics.h)
+
+  set_property(GLOBAL PROPERTY "USER-INTEL_SOURCES" "${USER-INTEL_SOURCES}")
+
+  # detects styles which have USER-INTEL version
+  RegisterStylesExt(${USER-INTEL_SOURCES_DIR} intel USER-INTEL_SOURCES)
+
+  get_property(USER-INTEL_SOURCES GLOBAL PROPERTY USER-INTEL_SOURCES)
+
+  list(APPEND LIB_SOURCES ${USER-INTEL_SOURCES})
+  include_directories(${USER-INTEL_SOURCES_DIR})
+endif()
+
+if(ENABLE_GPU)
+  find_package(CUDA REQUIRED)
+  find_program(BIN2C bin2c)
+  if(NOT BIN2C)
+    message(FATAL_ERROR "Couldn't find bin2c, use -DBIN2C to help cmake find it.")
+  endif()
+  include_directories(${CUDA_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
+  set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "Lammps gpu precision size")
+  set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE)
+  add_definitions(-D_${GPU_PREC})
+  add_definitions(-DNV_KERNEL -DUCL_CUDADR)
+  option(CUDPP_OPT "Enable CUDPP_OPT" ON)
+
+  set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
+  set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h)
+
+  set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
+
+  # detects styles which have GPU version
+  RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
+
+  get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
+
+  file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp)
+  file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_SOURCE_DIR}/gpu/*.cu)
+  file(GLOB_RECURSE GPU_NOT_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
+  list(REMOVE_ITEM GPU_LIB_CU ${GPU_NOT_LIB_CU})
+  include_directories(${GPU_SOURCES_DIR} ${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
+  if(CUDPP_OPT)
+    include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
+    add_definitions(-DCUDPP_OPT)
+    file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cpp)
+    file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cu)
+  endif()
+  cuda_compile(GPU_OBJS ${GPU_LIB_CU} ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC>)
+  file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
+  foreach(CU_OBJ ${GPU_OBJS})
+    get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
+    string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
+    add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
+      COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
+      DEPENDS ${CU_OBJ}
+      COMMENT "Generating ${CU_NAME}_cubin.h")
+    list(APPEND LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
+    if(${CU_NAME} STREQUAL "pppm_d") #pppm_d doesn't get linked into the lib
+      set(CU_FORBIDDEN_OBJ "${CU_OBJ}")
+    endif()
+  endforeach()
+  list(REMOVE_ITEM GPU_OBJS "${CU_FORBIDDEN_OBJ}")
+  list(APPEND LIB_SOURCES ${GPU_SOURCES} ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
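As a usage sketch (paths and values are examples only, assuming a CUDA toolkit is installed), the GPU options defined above would be exercised with a configure line such as:

```
cmake ../cmake -DENABLE_GPU=ON -DGPU_PREC=DOUBLE_DOUBLE \
      -DBIN2C=/usr/local/cuda/bin/bin2c
```
+ 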
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
+endif()
+
+######################################################
+# Generate style headers based on global list of
+# styles registered during package selection
+######################################################
+set(LAMMPS_STYLE_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/styles)
+
+GenerateStyleHeaders(${LAMMPS_STYLE_HEADERS_DIR})
+
+include_directories(${LAMMPS_SOURCE_DIR})
+include_directories(${LAMMPS_STYLE_HEADERS_DIR})
+
+###########################################
+# Actually add executable and lib to build
+############################################
+add_library(lammps ${LIB_SOURCES})
+target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
+set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
+if(INSTALL_LIB)
+  install(TARGETS lammps LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  install(FILES ${LAMMPS_SOURCE_DIR}/lammps.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+elseif(BUILD_SHARED_LIBS)
+  message(FATAL_ERROR "The shared library has to be installed, use -DINSTALL_LIB=ON to install the lammps library")
+endif()
+
+add_executable(lmp ${LMP_SOURCES})
+target_link_libraries(lmp lammps)
+install(TARGETS lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
+if(ENABLE_TESTING)
+  add_test(ShowHelp ${CMAKE_CURRENT_BINARY_DIR}/lmp -help)
+endif()
+
+##################################
+# Print package summary
+##################################
+foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES} ${ACCEL_PACKAGES})
+  if(ENABLE_${PKG})
+    message(STATUS "Building package: ${PKG}")
+  endif()
+endforeach() diff --git a/cmake/Modules/FindFFTW2.cmake b/cmake/Modules/FindFFTW2.cmake new file mode 100644 index 0000000000..c77e6cf8e9 --- /dev/null +++ b/cmake/Modules/FindFFTW2.cmake @@ -0,0 +1,22 @@ +# - Find fftw2
+# Find the native FFTW2 headers and libraries.
+#
+# FFTW2_INCLUDE_DIRS - where to find fftw2.h, etc.
+# FFTW2_LIBRARIES - List of libraries when using fftw2.
+# FFTW2_FOUND - True if fftw2 found.
+#
+
+find_path(FFTW2_INCLUDE_DIR fftw.h)
+
+find_library(FFTW2_LIBRARY NAMES fftw)
+
+set(FFTW2_LIBRARIES ${FFTW2_LIBRARY})
+set(FFTW2_INCLUDE_DIRS ${FFTW2_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set FFTW2_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(FFTW2 DEFAULT_MSG FFTW2_LIBRARY FFTW2_INCLUDE_DIR)
+
+mark_as_advanced(FFTW2_INCLUDE_DIR FFTW2_LIBRARY ) diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake new file mode 100644 index 0000000000..552bcc4257 --- /dev/null +++ b/cmake/Modules/FindFFTW3.cmake @@ -0,0 +1,25 @@ +# - Find fftw3
+# Find the native FFTW3 headers and libraries.
+#
+# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
+# FFTW3_LIBRARIES - List of libraries when using fftw3.
+# FFTW3_FOUND - True if fftw3 found.
+# + +find_package(PkgConfig) + +pkg_check_modules(PC_FFTW3 fftw3) +find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS}) + +find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS}) + +set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) +set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR) + +mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY ) diff --git a/cmake/Modules/FindKIM.cmake b/cmake/Modules/FindKIM.cmake new file mode 100644 index 0000000000..a01f817cf6 --- /dev/null +++ b/cmake/Modules/FindKIM.cmake @@ -0,0 +1,22 @@ +# - Find kim +# Find the native KIM headers and libraries. +# +# KIM_INCLUDE_DIRS - where to find kim.h, etc. +# KIM_LIBRARIES - List of libraries when using kim. +# KIM_FOUND - True if kim found. +# + +find_path(KIM_INCLUDE_DIR KIM_API.h PATH_SUFFIXES kim-api-v1) + +find_library(KIM_LIBRARY NAMES kim-api-v1) + +set(KIM_LIBRARIES ${KIM_LIBRARY}) +set(KIM_INCLUDE_DIRS ${KIM_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set KIM_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(KIM DEFAULT_MSG KIM_LIBRARY KIM_INCLUDE_DIR) + +mark_as_advanced(KIM_INCLUDE_DIR KIM_LIBRARY ) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000000..4246062103 --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,22 @@ +# - Find mkl +# Find the native MKL headers and libraries. +# +# MKL_INCLUDE_DIRS - where to find mkl.h, etc. +# MKL_LIBRARIES - List of libraries when using mkl. +# MKL_FOUND - True if mkl found. +# + +find_path(MKL_INCLUDE_DIR mkl_dfti.h HINTS $ENV{MKLROOT}/include) + +find_library(MKL_LIBRARY NAMES mkl_rt HINTS $ENV{MKLROOT}/lib $ENV{MKLROOT}/lib/intel64) + +set(MKL_LIBRARIES ${MKL_LIBRARY}) +set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set MKL_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIR) + +mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARY ) diff --git a/cmake/Modules/FindNetCDF.cmake b/cmake/Modules/FindNetCDF.cmake new file mode 100644 index 0000000000..a28c959acf --- /dev/null +++ b/cmake/Modules/FindNetCDF.cmake @@ -0,0 +1,118 @@ +# - Find NetCDF +# Find the native NetCDF includes and library +# +# NETCDF_INCLUDE_DIR - user modifiable choice of where netcdf headers are +# NETCDF_LIBRARY - user modifiable choice of where netcdf libraries are +# +# Your package can require certain interfaces to be FOUND by setting these +# +# NETCDF_CXX - require the C++ interface and link the C++ library +# NETCDF_F77 - require the F77 interface and link the fortran library +# NETCDF_F90 - require the F90 interface and link the fortran library +# +# Or equivalently by calling FindNetCDF with a COMPONENTS argument containing one or +# more of "CXX;F77;F90". +# +# When interfaces are requested the user has access to interface specific hints: +# +# NETCDF_${LANG}_INCLUDE_DIR - where to search for interface header files +# NETCDF_${LANG}_LIBRARY - where to search for interface libraries +# +# This module returns these variables for the rest of the project to use. 
+#
+# NETCDF_FOUND - True if NetCDF found including required interfaces (see below)
+# NETCDF_LIBRARIES - All netcdf related libraries.
+# NETCDF_INCLUDE_DIRS - All directories to include.
+# NETCDF_HAS_INTERFACES - Whether requested interfaces were found or not.
+# NETCDF_${LANG}_INCLUDE_DIRS/NETCDF_${LANG}_LIBRARIES - C/C++/F77/F90 only interface
+#
+# Normal usage would be:
+# set (NETCDF_F90 "YES")
+# find_package (NetCDF REQUIRED)
+# target_link_libraries (uses_everything ${NETCDF_LIBRARIES})
+# target_link_libraries (only_uses_f90 ${NETCDF_F90_LIBRARIES})
+
+#search starting from user editable cache var
+if (NETCDF_INCLUDE_DIR AND NETCDF_LIBRARY)
+  # Already in cache, be silent
+  set (NETCDF_FIND_QUIETLY TRUE)
+endif ()
+
+set(USE_DEFAULT_PATHS "NO_DEFAULT_PATH")
+if(NETCDF_USE_DEFAULT_PATHS)
+  set(USE_DEFAULT_PATHS "")
+endif()
+
+find_path (NETCDF_INCLUDE_DIR netcdf.h
+  HINTS "${NETCDF_DIR}/include")
+mark_as_advanced (NETCDF_INCLUDE_DIR)
+set (NETCDF_C_INCLUDE_DIRS ${NETCDF_INCLUDE_DIR})
+
+find_library (NETCDF_LIBRARY NAMES netcdf
+  HINTS "${NETCDF_DIR}/lib")
+mark_as_advanced (NETCDF_LIBRARY)
+
+set (NETCDF_C_LIBRARIES ${NETCDF_LIBRARY})
+
+#start finding requested language components
+set (NetCDF_libs "")
+set (NetCDF_includes "${NETCDF_INCLUDE_DIR}")
+
+get_filename_component (NetCDF_lib_dirs "${NETCDF_LIBRARY}" PATH)
+set (NETCDF_HAS_INTERFACES "YES") # will be set to NO if we're missing any interfaces
+
+macro (NetCDF_check_interface lang header libs)
+  if (NETCDF_${lang})
+    #search starting from user modifiable cache var
+    find_path (NETCDF_${lang}_INCLUDE_DIR NAMES ${header}
+      HINTS "${NETCDF_INCLUDE_DIR}"
+      HINTS "${NETCDF_${lang}_ROOT}/include"
+      ${USE_DEFAULT_PATHS})
+
+    find_library (NETCDF_${lang}_LIBRARY NAMES ${libs}
+      HINTS "${NetCDF_lib_dirs}"
+      HINTS "${NETCDF_${lang}_ROOT}/lib"
+      ${USE_DEFAULT_PATHS})
+
+    mark_as_advanced (NETCDF_${lang}_INCLUDE_DIR NETCDF_${lang}_LIBRARY)
+
+    #export to internal vars that the rest of the project can use directly
+    set (NETCDF_${lang}_LIBRARIES ${NETCDF_${lang}_LIBRARY})
+    set (NETCDF_${lang}_INCLUDE_DIRS ${NETCDF_${lang}_INCLUDE_DIR})
+
+    if (NETCDF_${lang}_INCLUDE_DIR AND NETCDF_${lang}_LIBRARY)
+      list (APPEND NetCDF_libs ${NETCDF_${lang}_LIBRARY})
+      list (APPEND NetCDF_includes ${NETCDF_${lang}_INCLUDE_DIR})
+    else ()
+      set (NETCDF_HAS_INTERFACES "NO")
+      message (STATUS "Failed to find NetCDF interface for ${lang}")
+    endif ()
+  endif ()
+endmacro ()
+
+list (FIND NetCDF_FIND_COMPONENTS "CXX" _nextcomp)
+if (_nextcomp GREATER -1)
+  set (NETCDF_CXX 1)
+endif ()
+list (FIND NetCDF_FIND_COMPONENTS "F77" _nextcomp)
+if (_nextcomp GREATER -1)
+  set (NETCDF_F77 1)
+endif ()
+list (FIND NetCDF_FIND_COMPONENTS "F90" _nextcomp)
+if (_nextcomp GREATER -1)
+  set (NETCDF_F90 1)
+endif ()
+NetCDF_check_interface (CXX netcdfcpp.h netcdf_c++)
+NetCDF_check_interface (F77 netcdf.inc netcdff)
+NetCDF_check_interface (F90 netcdf.mod netcdff)
+
+#export accumulated results to internal vars that the rest of the project can depend on
+list (APPEND NetCDF_libs "${NETCDF_C_LIBRARIES}")
+set (NETCDF_LIBRARIES ${NetCDF_libs})
+set (NETCDF_INCLUDE_DIRS ${NetCDF_includes})
+
+# handle the QUIETLY and REQUIRED arguments and set NETCDF_FOUND to TRUE if
+# all listed variables are TRUE
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (NetCDF
+  DEFAULT_MSG NETCDF_LIBRARIES NETCDF_INCLUDE_DIRS NETCDF_HAS_INTERFACES) diff --git a/cmake/Modules/FindQE.cmake b/cmake/Modules/FindQE.cmake new file mode 100644 index 0000000000..4484bd4db2 --- /dev/null 
+++ b/cmake/Modules/FindQE.cmake @@ -0,0 +1,29 @@ +# - Find quantum-espresso
+# Find the native QE headers and libraries.
+#
+# QE_INCLUDE_DIRS - where to find quantum-espresso.h, etc.
+# QE_LIBRARIES - List of libraries when using quantum-espresso.
+# QE_FOUND - True if quantum-espresso found.
+#
+
+find_path(QE_INCLUDE_DIR libqecouple.h PATH_SUFFIXES COUPLE/include)
+
+find_library(QECOUPLE_LIBRARY NAMES qecouple)
+find_library(PW_LIBRARY NAMES pw)
+find_library(QEMOD_LIBRARY NAMES qemod)
+find_library(QEFFT_LIBRARY NAMES qefft)
+find_library(QELA_LIBRARY NAMES qela)
+find_library(CLIB_LIBRARY NAMES clib)
+find_library(IOTK_LIBRARY NAMES iotk)
+
+
+set(QE_LIBRARIES ${QECOUPLE_LIBRARY} ${PW_LIBRARY} ${QEMOD_LIBRARY} ${QEFFT_LIBRARY} ${QELA_LIBRARY} ${CLIB_LIBRARY} ${IOTK_LIBRARY})
+set(QE_INCLUDE_DIRS ${QE_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set QE_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(QE DEFAULT_MSG QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY QE_INCLUDE_DIR)
+
+mark_as_advanced(QE_INCLUDE_DIR QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY) diff --git a/cmake/Modules/FindQUIP.cmake b/cmake/Modules/FindQUIP.cmake new file mode 100644 index 0000000000..4ee1baf4f8 --- /dev/null +++ b/cmake/Modules/FindQUIP.cmake @@ -0,0 +1,18 @@ +# - Find quip
+# Find the native QUIP libraries.
+#
+# QUIP_LIBRARIES - List of libraries when using quip.
+# QUIP_FOUND - True if quip found.
+#
+
+find_library(QUIP_LIBRARY NAMES quip)
+
+set(QUIP_LIBRARIES ${QUIP_LIBRARY})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set QUIP_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(QUIP DEFAULT_MSG QUIP_LIBRARY)
+
+mark_as_advanced(QUIP_LIBRARY) diff --git a/cmake/Modules/FindVORO.cmake b/cmake/Modules/FindVORO.cmake new file mode 100644 index 0000000000..b0cccbcd1d --- /dev/null +++ b/cmake/Modules/FindVORO.cmake @@ -0,0 +1,22 @@ +# - Find voro++
+# Find the native VORO headers and libraries.
+#
+# VORO_INCLUDE_DIRS - where to find voro++.hh, etc.
+# VORO_LIBRARIES - List of libraries when using voro++.
+# VORO_FOUND - True if voro++ found.
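As with the other Find modules in this directory, the cached variables of the module below can be preset on the configure command line when the library lives in a non-standard location; the paths here are placeholders only:

```
cmake ../cmake -DENABLE_VORONOI=ON \
      -DVORO_INCLUDE_DIR=$HOME/voro++/include/voro++ \
      -DVORO_LIBRARY=$HOME/voro++/lib/libvoro++.a
```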
+# + +find_path(VORO_INCLUDE_DIR voro++.hh PATH_SUFFIXES voro++) + +find_library(VORO_LIBRARY NAMES voro++) + +set(VORO_LIBRARIES ${VORO_LIBRARY}) +set(VORO_INCLUDE_DIRS ${VORO_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set VORO_FOUND to TRUE +# if all listed variables are TRUE + +find_package_handle_standard_args(VORO DEFAULT_MSG VORO_LIBRARY VORO_INCLUDE_DIR) + +mark_as_advanced(VORO_INCLUDE_DIR VORO_LIBRARY ) diff --git a/cmake/Modules/StyleHeaderUtils.cmake b/cmake/Modules/StyleHeaderUtils.cmake new file mode 100644 index 0000000000..9939a7505a --- /dev/null +++ b/cmake/Modules/StyleHeaderUtils.cmake @@ -0,0 +1,132 @@ +function(FindStyleHeaders path style_class file_pattern headers) + file(GLOB files "${path}/${file_pattern}*.h") + get_property(hlist GLOBAL PROPERTY ${headers}) + + foreach(file_name ${files}) + file(STRINGS ${file_name} is_style LIMIT_COUNT 1 REGEX ${style_class}) + if(is_style) + list(APPEND hlist ${file_name}) + endif() + endforeach() + set_property(GLOBAL PROPERTY ${headers} "${hlist}") +endfunction(FindStyleHeaders) + +function(FindStyleHeadersExt path style_class extension headers sources) + get_property(hlist GLOBAL PROPERTY ${headers}) + get_property(slist GLOBAL PROPERTY ${sources}) + set(ext_list) + get_filename_component(abs_path "${path}" ABSOLUTE) + + foreach(file_name ${hlist}) + get_filename_component(basename ${file_name} NAME_WE) + set(ext_file_name "${abs_path}/${basename}_${extension}.h") + if(EXISTS "${ext_file_name}") + file(STRINGS ${ext_file_name} is_style LIMIT_COUNT 1 REGEX ${style_class}) + if(is_style) + list(APPEND ext_list ${ext_file_name}) + + set(source_file_name "${abs_path}/${basename}_${extension}.cpp") + if(EXISTS "${source_file_name}") + list(APPEND slist ${source_file_name}) + endif() + endif() + endif() + endforeach() + + list(APPEND hlist ${ext_list}) + set_property(GLOBAL PROPERTY ${headers} "${hlist}") + set_property(GLOBAL PROPERTY ${sources} "${slist}") +endfunction(FindStyleHeadersExt) + +function(CreateStyleHeader path filename) + math(EXPR N "${ARGC}-2") + + set(temp "") + if(N GREATER 0) + math(EXPR ARG_END "${ARGC}-1") + + foreach(IDX RANGE 2 ${ARG_END}) + list(GET ARGV ${IDX} FNAME) + get_filename_component(FNAME ${FNAME} NAME) + set(temp "${temp}#include \"${FNAME}\"\n") + endforeach() + endif() + message(STATUS "Generating ${filename}...") + file(WRITE "${path}/${filename}.tmp" "${temp}" ) + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${path}/${filename}.tmp" "${path}/${filename}") +endfunction(CreateStyleHeader) + +function(GenerateStyleHeader path property style) + get_property(files GLOBAL PROPERTY ${property}) + #message("${property} = ${files}") + CreateStyleHeader("${path}" "style_${style}.h" ${files}) +endfunction(GenerateStyleHeader) + +function(RegisterStyles search_path) + FindStyleHeaders(${search_path} ANGLE_CLASS angle_ ANGLE ) # angle ) # force + FindStyleHeaders(${search_path} ATOM_CLASS atom_vec_ ATOM_VEC ) # atom ) # atom atom_vec_hybrid + FindStyleHeaders(${search_path} BODY_CLASS body_ BODY ) # body ) # atom_vec_body + FindStyleHeaders(${search_path} BOND_CLASS bond_ BOND ) # bond ) # force + FindStyleHeaders(${search_path} COMMAND_CLASS "" COMMAND ) # command ) # input + FindStyleHeaders(${search_path} COMPUTE_CLASS compute_ COMPUTE ) # compute ) # modify + FindStyleHeaders(${search_path} DIHEDRAL_CLASS dihedral_ DIHEDRAL ) # dihedral ) # force + FindStyleHeaders(${search_path} DUMP_CLASS dump_ DUMP ) # dump ) # output 
write_dump + FindStyleHeaders(${search_path} FIX_CLASS fix_ FIX ) # fix ) # modify + FindStyleHeaders(${search_path} IMPROPER_CLASS improper_ IMPROPER ) # improper ) # force + FindStyleHeaders(${search_path} INTEGRATE_CLASS "" INTEGRATE ) # integrate ) # update + FindStyleHeaders(${search_path} KSPACE_CLASS "" KSPACE ) # kspace ) # force + FindStyleHeaders(${search_path} MINIMIZE_CLASS min_ MINIMIZE ) # minimize ) # update + FindStyleHeaders(${search_path} NBIN_CLASS nbin_ NBIN ) # nbin ) # neighbor + FindStyleHeaders(${search_path} NPAIR_CLASS npair_ NPAIR ) # npair ) # neighbor + FindStyleHeaders(${search_path} NSTENCIL_CLASS nstencil_ NSTENCIL ) # nstencil ) # neighbor + FindStyleHeaders(${search_path} NTOPO_CLASS ntopo_ NTOPO ) # ntopo ) # neighbor + FindStyleHeaders(${search_path} PAIR_CLASS pair_ PAIR ) # pair ) # force + FindStyleHeaders(${search_path} READER_CLASS reader_ READER ) # reader ) # read_dump + FindStyleHeaders(${search_path} REGION_CLASS region_ REGION ) # region ) # domain +endfunction(RegisterStyles) + +function(RegisterStylesExt search_path extension sources) + FindStyleHeadersExt(${search_path} ANGLE_CLASS ${extension} ANGLE ${sources}) + FindStyleHeadersExt(${search_path} ATOM_CLASS ${extension} ATOM_VEC ${sources}) + FindStyleHeadersExt(${search_path} BODY_CLASS ${extension} BODY ${sources}) + FindStyleHeadersExt(${search_path} BOND_CLASS ${extension} BOND ${sources}) + FindStyleHeadersExt(${search_path} COMMAND_CLASS ${extension} COMMAND ${sources}) + FindStyleHeadersExt(${search_path} COMPUTE_CLASS ${extension} COMPUTE ${sources}) + FindStyleHeadersExt(${search_path} DIHEDRAL_CLASS ${extension} DIHEDRAL ${sources}) + FindStyleHeadersExt(${search_path} DUMP_CLASS ${extension} DUMP ${sources}) + FindStyleHeadersExt(${search_path} FIX_CLASS ${extension} FIX ${sources}) + FindStyleHeadersExt(${search_path} IMPROPER_CLASS ${extension} IMPROPER ${sources}) + FindStyleHeadersExt(${search_path} INTEGRATE_CLASS ${extension} INTEGRATE ${sources}) + FindStyleHeadersExt(${search_path} KSPACE_CLASS ${extension} KSPACE ${sources}) + FindStyleHeadersExt(${search_path} MINIMIZE_CLASS ${extension} MINIMIZE ${sources}) + FindStyleHeadersExt(${search_path} NBIN_CLASS ${extension} NBIN ${sources}) + FindStyleHeadersExt(${search_path} NPAIR_CLASS ${extension} NPAIR ${sources}) + FindStyleHeadersExt(${search_path} NSTENCIL_CLASS ${extension} NSTENCIL ${sources}) + FindStyleHeadersExt(${search_path} NTOPO_CLASS ${extension} NTOPO ${sources}) + FindStyleHeadersExt(${search_path} PAIR_CLASS ${extension} PAIR ${sources}) + FindStyleHeadersExt(${search_path} READER_CLASS ${extension} READER ${sources}) + FindStyleHeadersExt(${search_path} REGION_CLASS ${extension} REGION ${sources}) +endfunction(RegisterStylesExt) + +function(GenerateStyleHeaders output_path) + GenerateStyleHeader(${output_path} ANGLE angle ) # force + GenerateStyleHeader(${output_path} ATOM_VEC atom ) # atom atom_vec_hybrid + GenerateStyleHeader(${output_path} BODY body ) # atom_vec_body + GenerateStyleHeader(${output_path} BOND bond ) # force + GenerateStyleHeader(${output_path} COMMAND command ) # input + GenerateStyleHeader(${output_path} COMPUTE compute ) # modify + GenerateStyleHeader(${output_path} DIHEDRAL dihedral ) # force + GenerateStyleHeader(${output_path} DUMP dump ) # output write_dump + GenerateStyleHeader(${output_path} FIX fix ) # modify + GenerateStyleHeader(${output_path} IMPROPER improper ) # force + GenerateStyleHeader(${output_path} INTEGRATE integrate ) # update + GenerateStyleHeader(${output_path} 
KSPACE kspace ) # force
+  GenerateStyleHeader(${output_path} MINIMIZE minimize ) # update
+  GenerateStyleHeader(${output_path} NBIN nbin ) # neighbor
+  GenerateStyleHeader(${output_path} NPAIR npair ) # neighbor
+  GenerateStyleHeader(${output_path} NSTENCIL nstencil ) # neighbor
+  GenerateStyleHeader(${output_path} NTOPO ntopo ) # neighbor
+  GenerateStyleHeader(${output_path} PAIR pair ) # force
+  GenerateStyleHeader(${output_path} READER reader ) # read_dump
+  GenerateStyleHeader(${output_path} REGION region ) # domain
+endfunction(GenerateStyleHeaders) diff --git a/cmake/README b/cmake/README new file mode 100644 index 0000000000..cc67cceb52 --- /dev/null +++ b/cmake/README @@ -0,0 +1,19 @@ +cmake-buildsystem
+-----------------
+
+To use the cmake build system instead of the make-driven one, do:
+```
+cmake /path/to/lammps/source/cmake
+```
+(please note the cmake directory at the very end)
+
+To enable a package, e.g. GPU, do
+```
+cmake /path/to/lammps/source/cmake -DENABLE_GPU=ON
+```
+
+cmake has many, many options; to get an overview use the curses-based cmake interface, ccmake:
+```
+ccmake /path/to/lammps/source/cmake
+```
+(Don't forget to press "g" for generate once you are done with configuring) diff --git a/cmake/gpu/lal_pppm_d.cu b/cmake/gpu/lal_pppm_d.cu new file mode 100644 index 0000000000..a49a535013 --- /dev/null +++ b/cmake/gpu/lal_pppm_d.cu @@ -0,0 +1,4 @@ +#define grdtyp double
+#define grdtyp4 double4
+
+#include "lal_pppm.cu" diff --git a/cmake/gpu/lal_pppm_f.cu b/cmake/gpu/lal_pppm_f.cu new file mode 100644 index 0000000000..e7f5116fa0 --- /dev/null +++ b/cmake/gpu/lal_pppm_f.cu @@ -0,0 +1,4 @@ +#define grdtyp float
+#define grdtyp4 float4
+
+#include "lal_pppm.cu" diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 302b50124a..7ec83b3207 100755 Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt index 2af2ffd4b7..bb2e1b8114 100644 --- a/doc/src/Manual.txt +++ b/doc/src/Manual.txt @@ -1,7 +1,7 @@ LAMMPS Users Manual - + @@ -21,7 +21,7 @@

LAMMPS Documentation :c,h3 -10 Aug 2017 version :c,h4 +17 Aug 2017 version :c,h4 Version info: :h4 @@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the "LAMMPS project on GitHub."_https://github.com/lammps/lammps The lammps.org domain, currently hosting "public continuous integration testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux -RPM and Windows installer packages"_http://rpm.lammps.org is located +RPM and Windows installer packages"_http://packages.lammps.org is located at Temple University and managed by Richard Berger, richard.berger at temple.edu. diff --git a/doc/src/PDF/colvars-refman-lammps.pdf b/doc/src/PDF/colvars-refman-lammps.pdf index a14d93cd69..ad15752107 100644 Binary files a/doc/src/PDF/colvars-refman-lammps.pdf and b/doc/src/PDF/colvars-refman-lammps.pdf differ diff --git a/doc/src/Section_commands.txt b/doc/src/Section_commands.txt index f1eb225fe5..571c6c4920 100644 --- a/doc/src/Section_commands.txt +++ b/doc/src/Section_commands.txt @@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT. "hybrid"_pair_hybrid.html, "hybrid/overlay"_pair_hybrid.html, "adp (o)"_pair_adp.html, -"airebo (o)"_pair_airebo.html, -"airebo/morse (o)"_pair_airebo.html, +"airebo (oi)"_pair_airebo.html, +"airebo/morse (oi)"_pair_airebo.html, "beck (go)"_pair_beck.html, "body"_pair_body.html, "bop"_pair_bop.html, @@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT. "dpd/tstat (go)"_pair_dpd.html, "dsmc"_pair_dsmc.html, "eam (gkiot)"_pair_eam.html, -"eam/alloy (gkot)"_pair_eam.html, -"eam/fs (gkot)"_pair_eam.html, +"eam/alloy (gkiot)"_pair_eam.html, +"eam/fs (gkiot)"_pair_eam.html, "eim (o)"_pair_eim.html, "gauss (go)"_pair_gauss.html, "gayberne (gio)"_pair_gayberne.html, @@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT. "kim"_pair_kim.html, "lcbop"_pair_lcbop.html, "line/lj"_pair_line_lj.html, -"lj/charmm/coul/charmm (ko)"_pair_charmm.html, +"lj/charmm/coul/charmm (kio)"_pair_charmm.html, "lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html, -"lj/charmm/coul/long (giko)"_pair_charmm.html, +"lj/charmm/coul/long (gkio)"_pair_charmm.html, "lj/charmm/coul/msm"_pair_charmm.html, "lj/charmmfsw/coul/charmmfsh"_pair_charmm.html, "lj/charmmfsw/coul/long"_pair_charmm.html, @@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT. "polymorphic"_pair_polymorphic.html, "python"_pair_python.html, "reax"_pair_reax.html, -"rebo (o)"_pair_airebo.html, +"rebo (oi)"_pair_airebo.html, "resquared (go)"_pair_resquared.html, "snap"_pair_snap.html, "soft (go)"_pair_soft.html, diff --git a/doc/src/Section_errors.txt b/doc/src/Section_errors.txt index 408c01d52c..f5829f92fb 100644 --- a/doc/src/Section_errors.txt +++ b/doc/src/Section_errors.txt @@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd {New bond exceeded special list size in fix bond/create} :dt -See the "special_bonds extra" command -(or the "read_data extra/special/per/atom" command) +See the "read_data extra/special/per/atom" command +(or the "create_box extra/special/per/atom" command) for info on how to leave space in the special bonds list to allow for additional bonds to be formed. :dd @@ -9666,8 +9666,8 @@ you are running. :dd {Special list size exceeded in fix bond/create} :dt -See the special_bonds extra command -(or the read_data extra/special/per/atom command) +See the "read_data extra/special/per/atom" command +(or the "create_box extra/special/per/atom" command) for info on how to leave space in the special bonds list to allow for additional bonds to be formed. 
:dd diff --git a/doc/src/Section_start.txt b/doc/src/Section_start.txt index 6eef155be2..a25ec11cfe 100644 --- a/doc/src/Section_start.txt +++ b/doc/src/Section_start.txt @@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS and Windows system libraries to Unix-like environments like Linux or MacOS, when compiling for Windows a few adjustments may be needed: -Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable) +Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable) Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable) -Try adding -static-libgcc or -static or both to the linker flags when your -LAMMPS executable complains about missing .dll files :ul +Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul -Since none of the current LAMMPS core developers -has significant experience building executables on Windows, we are -happy to distribute contributed instructions and modifications, but -we cannot provide support for those. +Since none of the current LAMMPS core developers has significant +experience building executables on Windows, we are happy to distribute +contributed instructions and modifications to improve the situation, +but we cannot provide support for those. With the so-called "Anniversary Update" to Windows 10, there is a Ubuntu Linux subsystem available for Windows, that can be installed and then used to compile/install LAMMPS as if you are running on a Ubuntu Linux system instead of Windows. -As an alternative, you can download "daily builds" (and some older -versions) of the installer packages from -"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html. -These executables are built with most optional packages and the -download includes documentation, potential files, some tools and -many examples, but no source code. +As an alternative, you can download pre-compiled installer packages from +"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html. +These executables are built with most optional packages included and the +download includes documentation, potential files, some tools and many +examples, but no source code. :line @@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages. :line On a Windows box, you can skip making LAMMPS and simply download an -installer package from "here"_http://rpm.lammps.org/windows.html +installer package from "here"_http://packages.lammps.org/windows.html For running the non-MPI executable, follow these steps: @@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj] with the name of your LAMMPS input script. :l +
+The serial executable includes support for multi-threading
+parallelization from the styles in the USER-OMP package.
+
+To run with, e.g., 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp" :ule -For the MPI version, which allows you to run LAMMPS under Windows on -multiple processors, follow these steps: +For the MPI version, which allows you to run LAMMPS under Windows with
+the more general message passing parallel library (LAMMPS has been
+designed from the ground up to use MPI efficiently), follow these steps: -Download and install -"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads -for Windows. 
:ulb,l +Download and install a compatible MPI library binary package: +for 32-bit Windows +"mpich2-1.4.1p1-win-ia32.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi +and for 64-bit Windows +"mpich2-1.4.1p1-win-x86-64.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi +:ulb,l The LAMMPS Windows installer packages will automatically adjust your path for the default location of this MPI package. After the installation -of the MPICH software, it needs to be integrated into the system. +of the MPICH2 software, it needs to be integrated into the system. For this you need to start a Command Prompt in {Administrator Mode} (right click on the icon and select it). Change into the MPICH2 installation directory, then into the subdirectory [bin] and execute @@ -1137,7 +1144,7 @@ or mpiexec -np 4 lmp_mpi -in in.lj :pre -replacing in.lj with the name of your LAMMPS input script. For the latter +replacing [in.lj] with the name of your LAMMPS input script. For the latter case, you may be prompted to enter your password. :l In this mode, output may not immediately show up on the screen, so if @@ -1149,6 +1156,11 @@ something like: lmp_mpi -in in.lj :pre +And the parallel executable also includes OpenMP multi-threading, which +can be combined with MPI using something like: + +mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre + :ule :line diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index 9eb295e0d0..a7c3382caa 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l Dihedral Styles: charmm, harmonic, opls :l Fixes: nve, npt, nvt, nvt/sllod :l Improper Styles: cvff, harmonic :l -Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne, -charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l +Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, +buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, +lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo, +sw, tersoff :l K-Space Styles: pppm, pppm/disp :l :ule diff --git a/doc/src/fix_bond_create.txt b/doc/src/fix_bond_create.txt index a44c3103dd..c0045ac0f0 100644 --- a/doc/src/fix_bond_create.txt +++ b/doc/src/fix_bond_create.txt @@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor but may also induce *many* new 2nd and 3rd neighbors, depending on the molecular topology of your system. The "extra special per atom" parameter must typically be set to allow for the new maximum total -size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3 +size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2 ways to do this. See the "read_data"_read_data.html or -"create_box"_create_box.html or "special_bonds extra" commands for -details. +"create_box"_create_box.html commands for details. 
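For example (the data file and region names below are placeholders), either of the following reserves room for six additional special neighbors per atom:

read_data polymer.data extra/special/per/atom 6 :pre

create_box 2 mybox extra/special/per/atom 6 :pre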
NOTE: Even if you do not use the {atype}, {dtype}, or {itype} keywords, the list of topological neighbors is updated for atoms diff --git a/doc/src/pair_airebo.txt b/doc/src/pair_airebo.txt index e66ecb637f..1aa017f278 100644 --- a/doc/src/pair_airebo.txt +++ b/doc/src/pair_airebo.txt @@ -7,10 +7,13 @@ :line pair_style airebo command :h3 +pair_style airebo/intel command :h3 pair_style airebo/omp command :h3 pair_style airebo/morse command :h3 +pair_style airebo/morse/intel command :h3 pair_style airebo/morse/omp command :h3 pair_style rebo command :h3 +pair_style rebo/intel command :h3 pair_style rebo/omp command :h3 [Syntax:] diff --git a/doc/src/pair_charmm.txt b/doc/src/pair_charmm.txt index ef4ef41c95..75a8e4bff9 100644 --- a/doc/src/pair_charmm.txt +++ b/doc/src/pair_charmm.txt @@ -7,6 +7,7 @@ :line pair_style lj/charmm/coul/charmm command :h3 +pair_style lj/charmm/coul/charmm/intel command :h3 pair_style lj/charmm/coul/charmm/omp command :h3 pair_style lj/charmm/coul/charmm/implicit command :h3 pair_style lj/charmm/coul/charmm/implicit/omp command :h3 diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt index ce8495affd..a0026432ec 100644 --- a/doc/src/pair_eam.txt +++ b/doc/src/pair_eam.txt @@ -14,6 +14,7 @@ pair_style eam/omp command :h3 pair_style eam/opt command :h3 pair_style eam/alloy command :h3 pair_style eam/alloy/gpu command :h3 +pair_style eam/alloy/intel command :h3 pair_style eam/alloy/kk command :h3 pair_style eam/alloy/omp command :h3 pair_style eam/alloy/opt command :h3 @@ -21,6 +22,7 @@ pair_style eam/cd command :h3 pair_style eam/cd/omp command :h3 pair_style eam/fs command :h3 pair_style eam/fs/gpu command :h3 +pair_style eam/fs/intel command :h3 pair_style eam/fs/kk command :h3 pair_style eam/fs/omp command :h3 pair_style eam/fs/opt command :h3 diff --git a/doc/src/special_bonds.txt b/doc/src/special_bonds.txt index 6a661015bd..1021c4856b 100644 --- a/doc/src/special_bonds.txt +++ b/doc/src/special_bonds.txt @@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c {coul} values = w1,w2,w3 w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions {angle} value = {yes} or {no} - {dihedral} value = {yes} or {no} - {extra} value = N - N = number of extra 1-2,1-3,1-4 interactions to save space for :pre + {dihedral} value = {yes} or {no} :pre :ule Examples: @@ -36,8 +34,7 @@ special_bonds amber special_bonds charmm special_bonds fene dihedral no special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes -special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes -special_bonds lj/coul 0 1 1 extra 2 :pre +special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre [Description:] @@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting of 1.0). If the {dihedral} keyword is specified as {no} which is the default, then the 2,5 interaction will also be weighted by 0.5. -The {extra} keyword can be used when additional bonds will be created -during a simulation run, e.g. by the "fix -bond/create"_fix_bond_create.html command. It can also be used if -molecules will be added to the system, e.g. via the "fix -deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which -will have atoms with more special neighbors than any atom in the -current system has. - :line NOTE: LAMMPS stores and maintains a data structure with a list of the @@ -194,8 +183,9 @@ the system). 
If new bonds are created (or molecules added containing atoms with more special neighbors), the size of this list needs to grow. Note that adding a single bond always adds a new 1st neighbor but may also induce *many* new 2nd and 3rd neighbors, depending on the -molecular topology of your system. Using the {extra} keyword leaves -empty space in the list for this N additional 1st, 2nd, or 3rd +molecular topology of your system. Using the {extra/special/per/atom} +keyword to either "read_data"_read_data.html or "create_box"_create_box.html +reserves empty space in the list for this N additional 1st, 2nd, or 3rd neighbors to be added. If you do not do this, you may get an error when bonds (or molecules) are added. @@ -203,8 +193,7 @@ when bonds (or molecules) are added. NOTE: If you reuse this command in an input script, you should set all the options you need each time. This command cannot be used a 2nd -time incrementally, e.g. to add some extra storage locations via the -{extra} keyword. E.g. these two commands: +time incrementally. E.g. these two commands: special_bonds lj 0.0 1.0 1.0 special_bonds coul 0.0 0.0 1.0 @@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0 because the LJ settings are reset to their default values each time the command is issued. -Likewise - -special_bonds amber -special_bonds extra 2 :pre - -is not the same as this single command: - -special_bonds amber extra 2 :pre - -since in the former case, the 2nd command will reset all the LJ and -Coulombic weights to 0.0 (the default). - -One exception to this rule is the {extra} option itself. It is not -reset to its default value of 0 each time the special_bonds command is -invoked. This is because it can also be set by the -"read_data"_read_data.html and "create_box"_create_box.html commands, -so this command will not override those settings unless you explicitly -use {extra} as an option. - [Restrictions:] none [Related commands:] diff --git a/doc/src/tutorial_bash_on_windows.txt b/doc/src/tutorial_bash_on_windows.txt old mode 100755 new mode 100644 diff --git a/doc/src/tutorial_drude.txt b/doc/src/tutorial_drude.txt index b9a167b804..f6e7eed40b 100644 --- a/doc/src/tutorial_drude.txt +++ b/doc/src/tutorial_drude.txt @@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching DC-DP pairs and will treat DP as equivalent to their DC in the {special bonds} relations. It may be necessary to extend the space for storing such special relations. In this case extra space should -be reserved by using the {extra} keyword of the {special_bonds} +be reserved by using the {extra/special/per/atom} keyword of either +the "read_data"_read_data.html or "create_box"_create_box.html command. With our phenol, there is 1 more special neighbor for which space is required. Otherwise LAMMPS crashes and gives the required value. -special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre +read_data data-p.lmp extra/special/per/atom 1 :pre Let us assume we want to run a simple NVT simulation at 300 K. 
Note that Drude oscillators need to be thermalized at a low temperature in diff --git a/doc/src/tutorials.txt b/doc/src/tutorials.txt old mode 100755 new mode 100644 diff --git a/lib/colvars/Install.py b/lib/colvars/Install.py index 01e70543f2..030644ceb5 100644 --- a/lib/colvars/Install.py +++ b/lib/colvars/Install.py @@ -45,12 +45,12 @@ while iarg < nargs: if args[iarg] == "-m": if iarg+2 > len(args): error() machine = args[iarg+1] - iarg += 2 + iarg += 2 elif args[iarg] == "-e": if iarg+2 > len(args): error() extraflag = True suffix = args[iarg+1] - iarg += 2 + iarg += 2 else: error() # set lib from working dir diff --git a/lib/colvars/README b/lib/colvars/README index ce1d319974..5df9612dfa 100644 --- a/lib/colvars/README +++ b/lib/colvars/README @@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start. **Optional**: if you use the Install.py script provided in this folder, you can give the machine name as the '-m' argument. This can be the suffix of one -of the files from either this folder, or from src/MAKE. +of the files from either this folder, or from src/MAKE/MACHINES. *This is only supported by the Install.py within the lib/colvars folder*. When you are done building this library, two files should @@ -53,10 +53,10 @@ settings in Makefile.common should work. For the reference manual see: http://colvars.github.io/colvars-refman-lammps -A copy of reference manual is also in: +A copy of the reference manual is also in: doc/PDF/colvars-refman-lammps.pdf -Also included is a Doxygen-based developer documentation: +Also available is the Doxygen-based developer documentation: http://colvars.github.io/doxygen/html/ The reference article is: diff --git a/lib/colvars/colvar.h b/lib/colvars/colvar.h index 6113e1678b..dfa9e093a5 100644 --- a/lib/colvars/colvar.h +++ b/lib/colvars/colvar.h @@ -88,7 +88,12 @@ public: static std::vector cv_features; /// \brief Implementation of the feature list accessor for colvar - std::vector &features() { + virtual const std::vector &features() + { + return cv_features; + } + virtual std::vector &modify_features() + { return cv_features; } diff --git a/lib/colvars/colvaratoms.h b/lib/colvars/colvaratoms.h index dba2890abc..6113fb38a9 100644 --- a/lib/colvars/colvaratoms.h +++ b/lib/colvars/colvaratoms.h @@ -206,7 +206,12 @@ public: static std::vector ag_features; /// \brief Implementation of the feature list accessor for atom group - virtual std::vector &features() { + virtual const std::vector &features() + { + return ag_features; + } + virtual std::vector &modify_features() + { return ag_features; } diff --git a/lib/colvars/colvarbias.cpp b/lib/colvars/colvarbias.cpp index e437466be9..636727ca39 100644 --- a/lib/colvars/colvarbias.cpp +++ b/lib/colvars/colvarbias.cpp @@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os) os << " "; if (b_output_energy) os << " " + << std::setprecision(cvm::en_prec) << std::setw(cvm::en_width) << bias_energy; return os; } diff --git a/lib/colvars/colvarbias.h b/lib/colvars/colvarbias.h index 205e761cfc..a147cd3210 100644 --- a/lib/colvars/colvarbias.h +++ b/lib/colvars/colvarbias.h @@ -175,7 +175,11 @@ public: static std::vector cvb_features; /// \brief Implementation of the feature list accessor for colvarbias - virtual std::vector &features() + virtual const std::vector &features() + { + return cvb_features; + } + virtual std::vector &modify_features() { return cvb_features; } diff --git a/lib/colvars/colvarbias_restraint.cpp b/lib/colvars/colvarbias_restraint.cpp
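The colvar.h, colvaratoms.h, and colvarbias.h hunks above (and the colvardeps/colvarcomp hunks further below) all apply the same const-correctness pattern: the per-class static feature table keeps a read-only accessor, features(), while mutation moves to an explicitly named modify_features(). A minimal self-contained sketch of that pattern; the names deps_example and my_features are illustrative, not from the patch:

    #include <string>
    #include <vector>

    struct feature {
      std::string description;
    };

    class deps_example {
    public:
      // Read-only view: query paths (e.g. scripting interfaces) use this
      // and cannot accidentally mutate the shared table.
      virtual const std::vector<feature *> &features() { return my_features; }

      // Explicit mutable access: only one-time initialization should call it.
      virtual std::vector<feature *> &modify_features() { return my_features; }

      virtual ~deps_example() {}

    private:
      static std::vector<feature *> my_features;
    };

    std::vector<feature *> deps_example::my_features;

This split is what lets the colvarscript.cpp hunk below bind the result of obj->features() to a const reference in its "get"/"set" query path.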
index bb6d6164e5..6879190968 100644 --- a/lib/colvars/colvarbias_restraint.cpp +++ b/lib/colvars/colvarbias_restraint.cpp @@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf) if (null_centers) { // try to initialize the restraint centers for the first time colvar_centers.resize(num_variables()); - colvar_centers_raw.resize(num_variables()); for (i = 0; i < num_variables(); i++) { colvar_centers[i].type(variables(i)->value()); colvar_centers[i].reset(); - colvar_centers_raw[i].type(variables(i)->value()); - colvar_centers_raw[i].reset(); } } @@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf) if (cvm::debug()) { cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n"); } - colvar_centers_raw[i] = colvar_centers[i]; colvar_centers[i].apply_constraints(); } null_centers = false; @@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf) for (size_t i = 0; i < num_variables(); i++) { colvar_centers[i].type(variables(i)->value()); colvar_centers[i].apply_constraints(); - colvar_centers_raw[i].type(variables(i)->value()); - colvar_centers_raw[i] = colvar_centers[i]; } } return COLVARS_OK; @@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf) { if (b_chg_centers || b_chg_force_k) { if (target_nstages) { - // cvm::log ("Reading current stage from the restart.\n"); if (!get_keyval(conf, "stage", stage)) cvm::error("Error: current stage is missing from the restart.\n"); } @@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf) size_t i; if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) { - if (colvar_centers.size() != num_variables()) { + if (target_centers.size() != num_variables()) { cvm::error("Error: number of target centers does not match " - "that of collective variables.\n"); + "that of collective variables.\n", INPUT_ERROR); } b_chg_centers = true; for (i = 0; i < target_centers.size(); i++) { target_centers[i].apply_constraints(); + centers_incr.push_back(colvar_centers[i]); + centers_incr[i].reset(); } } if (b_chg_centers) { - // parse moving restraint options + // parse moving schedule options colvarbias_restraint_moving::init(conf); + if (initial_centers.size() == 0) { + // One-time init + initial_centers = colvar_centers; + } + // Call to check that the definition is correct + for (i = 0; i < num_variables(); i++) { + colvarvalue const midpoint = + colvarvalue::interpolate(initial_centers[i], + target_centers[i], + 0.5); + } } else { target_centers.clear(); return COLVARS_OK; } get_keyval(conf, "outputCenters", b_output_centers, b_output_centers); - get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work); + get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, + b_output_acc_work); // TODO this conflicts with stages return COLVARS_OK; } +int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda) +{ + if (cvm::debug()) { + cvm::log("Updating centers for the restraint bias \""+ + this->name+"\": "+cvm::to_str(colvar_centers)+".\n"); + } + size_t i; + for (i = 0; i < num_variables(); i++) { + colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i], + target_centers[i], + lambda); + centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]); + colvar_centers[i] = c_new; + variables(i)->wrap(colvar_centers[i]); + } + if (cvm::debug()) { + cvm::log("New centers for the restraint bias \""+ + this->name+"\": 
"+cvm::to_str(colvar_centers)+".\n"); + } + return cvm::get_error(); +} + + int colvarbias_restraint_centers_moving::update() { if (b_chg_centers) { - if (cvm::debug()) { - cvm::log("Updating centers for the restraint bias \""+ - this->name+"\": "+cvm::to_str(colvar_centers)+".\n"); - } - - if (!centers_incr.size()) { - // if this is the first calculation, calculate the advancement - // at each simulation step (or stage, if applicable) - // (take current stage into account: it can be non-zero - // if we are restarting a staged calculation) - centers_incr.resize(num_variables()); - for (size_t i = 0; i < num_variables(); i++) { - centers_incr[i].type(variables(i)->value()); - centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) / - cvm::real( target_nstages ? (target_nstages - stage) : - (target_nsteps - cvm::step_absolute())); - } - if (cvm::debug()) { - cvm::log("Center increment for the restraint bias \""+ - this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n"); - } - } - if (target_nstages) { - if ((cvm::step_relative() > 0) - && (cvm::step_absolute() % target_nsteps) == 0 - && stage < target_nstages) { - - for (size_t i = 0; i < num_variables(); i++) { - colvar_centers_raw[i] += centers_incr[i]; - colvar_centers[i] = colvar_centers_raw[i]; - variables(i)->wrap(colvar_centers[i]); - colvar_centers[i].apply_constraints(); + // Staged update + if (stage <= target_nstages) { + if ((cvm::step_relative() > 0) && + ((cvm::step_absolute() % target_nsteps) == 1)) { + cvm::real const lambda = + cvm::real(stage)/cvm::real(target_nstages); + update_centers(lambda); + stage++; + cvm::log("Moving restraint \"" + this->name + + "\" stage " + cvm::to_str(stage) + + " : setting centers to " + cvm::to_str(colvar_centers) + + " at step " + cvm::to_str(cvm::step_absolute())); + } else { + for (size_t i = 0; i < num_variables(); i++) { + centers_incr[i].reset(); + } } - stage++; - cvm::log("Moving restraint \"" + this->name + - "\" stage " + cvm::to_str(stage) + - " : setting centers to " + cvm::to_str(colvar_centers) + - " at step " + cvm::to_str(cvm::step_absolute())); } - } else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) { - // move the restraint centers in the direction of the targets - // (slow growth) + } else { + // Continuous update + if (cvm::step_absolute() <= target_nsteps) { + cvm::real const lambda = + cvm::real(cvm::step_absolute())/cvm::real(target_nsteps); + update_centers(lambda); + } else { + for (size_t i = 0; i < num_variables(); i++) { + centers_incr[i].reset(); + } + } + } + + if (cvm::step_relative() == 0) { for (size_t i = 0; i < num_variables(); i++) { - colvar_centers_raw[i] += centers_incr[i]; - colvar_centers[i] = colvar_centers_raw[i]; - variables(i)->wrap(colvar_centers[i]); - colvar_centers[i].apply_constraints(); + // finite differences are undefined when restarting + centers_incr[i].reset(); } } if (cvm::debug()) { - cvm::log("New centers for the restraint bias \""+ - this->name+"\": "+cvm::to_str(colvar_centers)+".\n"); + cvm::log("Center increment for the restraint bias \""+ + this->name+"\": "+cvm::to_str(centers_incr)+ + " at stage "+cvm::to_str(stage)+ ".\n"); } } - return COLVARS_OK; + return cvm::get_error(); } int colvarbias_restraint_centers_moving::update_acc_work() { if (b_output_acc_work) { - if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) { + if ((cvm::step_relative() > 0) && + (cvm::step_absolute() <= target_nsteps)) { for (size_t i = 0; i < num_variables(); i++) { // project 
forces on the calculated increments at this step acc_work += colvar_forces[i] * centers_incr[i]; @@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const << colvar_centers[i]; } os << "\n"; - os << "centers_raw "; - for (i = 0; i < num_variables(); i++) { - os << " " - << std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width) - << colvar_centers_raw[i]; - } - os << "\n"; if (b_output_acc_work) { os << "accumulatedWork " @@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const } } - return colvarbias_restraint_moving::get_state_params() + os.str(); + return os.str(); } @@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con // cvm::log ("Reading the updated restraint centers from the restart.\n"); if (!get_keyval(conf, "centers", colvar_centers)) cvm::error("Error: restraint centers are missing from the restart.\n"); - if (!get_keyval(conf, "centers_raw", colvar_centers_raw)) - cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n"); if (b_output_acc_work) { if (!get_keyval(conf, "accumulatedWork", acc_work)) cvm::error("Error: accumulatedWork is missing from the restart.\n"); @@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const << std::setprecision(cvm::en_prec) << std::setw(cvm::en_width) << force_k << "\n"; } - return colvarbias_restraint_moving::get_state_params() + os.str(); + return os.str(); } @@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons std::string const colvarbias_restraint_harmonic::get_state_params() const { return colvarbias_restraint::get_state_params() + + colvarbias_restraint_moving::get_state_params() + colvarbias_restraint_centers_moving::get_state_params() + colvarbias_restraint_k_moving::get_state_params(); } @@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf) { int error_code = COLVARS_OK; error_code |= colvarbias_restraint::set_state_params(conf); + error_code |= colvarbias_restraint_moving::set_state_params(conf); error_code |= colvarbias_restraint_centers_moving::set_state_params(conf); error_code |= colvarbias_restraint_k_moving::set_state_params(conf); return error_code; @@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i std::string const colvarbias_restraint_harmonic_walls::get_state_params() const { return colvarbias_restraint::get_state_params() + + colvarbias_restraint_moving::get_state_params() + colvarbias_restraint_k_moving::get_state_params(); } @@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con { int error_code = COLVARS_OK; error_code |= colvarbias_restraint::set_state_params(conf); + error_code |= colvarbias_restraint_moving::set_state_params(conf); error_code |= colvarbias_restraint_k_moving::set_state_params(conf); return error_code; } @@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const std::string const colvarbias_restraint_linear::get_state_params() const { return colvarbias_restraint::get_state_params() + + colvarbias_restraint_moving::get_state_params() + colvarbias_restraint_centers_moving::get_state_params() + colvarbias_restraint_k_moving::get_state_params(); } @@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf) { int error_code = COLVARS_OK; error_code 
|= colvarbias_restraint::set_state_params(conf); + error_code |= colvarbias_restraint_moving::set_state_params(conf); error_code |= colvarbias_restraint_centers_moving::set_state_params(conf); error_code |= colvarbias_restraint_k_moving::set_state_params(conf); return error_code; diff --git a/lib/colvars/colvarbias_restraint.h b/lib/colvars/colvarbias_restraint.h index 98b967abdb..8c3a1537fc 100644 --- a/lib/colvars/colvarbias_restraint.h +++ b/lib/colvars/colvarbias_restraint.h @@ -74,9 +74,6 @@ protected: /// \brief Restraint centers std::vector colvar_centers; - - /// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied) - std::vector colvar_centers_raw; }; @@ -156,10 +153,16 @@ protected: /// \brief New restraint centers std::vector target_centers; + /// \brief Initial value of the restraint centers + std::vector initial_centers; + /// \brief Amplitude of the restraint centers' increment at each step - /// (or stage) towards the new values (calculated from target_nsteps) + /// towards the new values (calculated from target_nsteps) std::vector centers_incr; + /// \brief Update the centers by interpolating between initial and target + virtual int update_centers(cvm::real lambda); + /// Whether to write the current restraint centers to the trajectory file bool b_output_centers; diff --git a/lib/colvars/colvarcomp.h b/lib/colvars/colvarcomp.h index 2c865a166b..3c1ec2495c 100644 --- a/lib/colvars/colvarcomp.h +++ b/lib/colvars/colvarcomp.h @@ -132,9 +132,15 @@ public: static std::vector cvc_features; /// \brief Implementation of the feature list accessor for colvar - virtual std::vector &features() { + virtual const std::vector &features() + { return cvc_features; } + virtual std::vector &modify_features() + { + return cvc_features; + } + /// \brief Obtain data needed for the calculation for the backend virtual void read_data(); diff --git a/lib/colvars/colvardeps.cpp b/lib/colvars/colvardeps.cpp index 5402836f53..8f241a6255 100644 --- a/lib/colvars/colvardeps.cpp +++ b/lib/colvars/colvardeps.cpp @@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) { } void colvardeps::init_feature(int feature_id, const char *description, feature_type type) { - features()[feature_id]->description = description; - features()[feature_id]->type = type; + modify_features()[feature_id]->description = description; + modify_features()[feature_id]->type = type; } // Shorthand macros for describing dependencies @@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() { int i; if (features().size() == 0) { for (i = 0; i < f_cvb_ntot; i++) { - features().push_back(new feature); + modify_features().push_back(new feature); } init_feature(f_cvb_active, "active", f_type_dynamic); @@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() { size_t i; if (features().size() == 0) { for (i = 0; i < f_cv_ntot; i++) { - features().push_back(new feature); + modify_features().push_back(new feature); } init_feature(f_cv_active, "active", f_type_dynamic); @@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() { // Initialize static array once and for all if (features().size() == 0) { for (i = 0; i < colvardeps::f_cvc_ntot; i++) { - features().push_back(new feature); + modify_features().push_back(new feature); } init_feature(f_cvc_active, "active", f_type_dynamic); @@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() { // Initialize static array once and for all if (features().size() == 0) { for (i = 0; i < f_ag_ntot; i++) { - features().push_back(new feature); + 
modify_features().push_back(new feature); } init_feature(f_ag_active, "active", f_type_dynamic); diff --git a/lib/colvars/colvardeps.h b/lib/colvars/colvardeps.h index b810a5fca1..dfb10d00e4 100644 --- a/lib/colvars/colvardeps.h +++ b/lib/colvars/colvardeps.h @@ -135,7 +135,8 @@ public: // with a non-static array // Intermediate classes (colvarbias and colvarcomp, which are also base classes) // implement this as virtual to allow overriding - virtual std::vector&features() = 0; + virtual const std::vector&features() = 0; + virtual std::vector&modify_features() = 0; void add_child(colvardeps *child); diff --git a/lib/colvars/colvars_version.h b/lib/colvars/colvars_version.h index e544756428..312c0fd1a0 100644 --- a/lib/colvars/colvars_version.h +++ b/lib/colvars/colvars_version.h @@ -1,4 +1,5 @@ -#define COLVARS_VERSION "2017-07-15" +#ifndef COLVARS_VERSION +#define COLVARS_VERSION "2017-08-06" // This file is part of the Collective Variables module (Colvars). // The original version of Colvars and its updates are located at: // https://github.com/colvars/colvars @@ -6,3 +7,4 @@ // If you wish to distribute your changes, please submit them to the // Colvars repository at GitHub. +#endif diff --git a/lib/colvars/colvarscript.cpp b/lib/colvars/colvarscript.cpp index 5bb2faae24..89302a16a2 100644 --- a/lib/colvars/colvarscript.cpp +++ b/lib/colvars/colvarscript.cpp @@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj, } if ((subcmd == "get") || (subcmd == "set")) { - std::vector &features = obj->features(); + std::vector const &features = obj->features(); std::string const req_feature(obj_to_str(objv[3])); colvardeps::feature *f = NULL; int fid = 0; diff --git a/lib/colvars/colvartypes.cpp b/lib/colvars/colvartypes.cpp index 5200d4d041..428fe1a4b1 100644 --- a/lib/colvars/colvartypes.cpp +++ b/lib/colvars/colvartypes.cpp @@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false; cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02; +/// Numerical recipes diagonalization +static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot); + +/// Eigenvector sort +static int eigsrt(cvm::real *d, cvm::real **v); + +/// Transpose the matrix +static int transpose(cvm::real **v); + + + std::string cvm::rvector::to_simple_string() const { std::ostringstream os; @@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d &S, // diagonalize int jac_nrot = 0; - jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot); + if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) != + COLVARS_OK) { + cvm::error("Too many iterations in routine jacobi.\n" + "This is usually the result of an ill-defined set of atoms for " + "rotational alignment (RMSD, rotateReference, etc).\n"); + } eigsrt(S_eigval.c_array(), S_eigvec.c_array()); // jacobi saves eigenvectors by columns transpose(S_eigvec.c_array()); @@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector co #define n 4 -void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) +int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) { int j,iq,ip,i; cvm::real tresh,theta,tau,t,sm,s,h,g,c; @@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) sm += std::fabs(a[ip][iq]); } if (sm == 0.0) { - return; + return COLVARS_OK; } if (i < 4) tresh=0.2*sm/(n*n); @@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot) z[ip]=0.0; } } - cvm::error("Too many 
iterations in routine jacobi.\n"); + return COLVARS_ERROR; } -void eigsrt(cvm::real *d, cvm::real **v) + +int eigsrt(cvm::real *d, cvm::real **v) { int k,j,i; cvm::real p; @@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v) } } } + return COLVARS_OK; } -void transpose(cvm::real **v) + +int transpose(cvm::real **v) { cvm::real p; int i,j; @@ -641,6 +660,7 @@ void transpose(cvm::real **v) v[j][i]=p; } } + return COLVARS_OK; } #undef n diff --git a/lib/colvars/colvartypes.h b/lib/colvars/colvartypes.h index 17c09a5095..fe3160eb4b 100644 --- a/lib/colvars/colvartypes.h +++ b/lib/colvars/colvartypes.h @@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m, } -/// Numerical recipes diagonalization -void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot); - -/// Eigenvector sort -void eigsrt(cvm::real *d, cvm::real **v); - -/// Transpose the matrix -void transpose(cvm::real **v); - - /// \brief 1-dimensional vector of real numbers with four components and diff --git a/lib/colvars/colvarvalue.cpp b/lib/colvars/colvarvalue.cpp index 7b498be6d6..312d101603 100644 --- a/lib/colvars/colvarvalue.cpp +++ b/lib/colvars/colvarvalue.cpp @@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const } +/// Return the interpolation between x1 and x2 with weight lambda +/// (which must be between 0.0 and 1.0; lambda = 0.5 gives the midpoint) +colvarvalue const colvarvalue::interpolate(colvarvalue const &x1, + colvarvalue const &x2, + cvm::real const lambda) +{ + colvarvalue::check_types(x1, x2); + + if ((lambda < 0.0) || (lambda > 1.0)) { + cvm::error("Error: trying to interpolate between two colvarvalues with a " + "lambda outside [0:1].\n", BUG_ERROR); + } + + colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2); + cvm::real const d2 = x1.dist2(x2); + + switch (x1.type()) { + case colvarvalue::type_scalar: + case colvarvalue::type_3vector: + case colvarvalue::type_vector: + case colvarvalue::type_unit3vectorderiv: + case colvarvalue::type_quaternionderiv: + return interp; + break; + case colvarvalue::type_unit3vector: + case colvarvalue::type_quaternion: + if (interp.norm()/std::sqrt(d2) < 1.0e-6) { + cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+ + cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+ + " is undefined: result = "+cvm::to_str(interp)+"\n", + INPUT_ERROR); + } + interp.apply_constraints(); + return interp; + break; + case colvarvalue::type_notset: + default: + x1.undef_op(); + break; + } + return colvarvalue(colvarvalue::type_notset); +} + + std::string colvarvalue::to_simple_string() const { switch (type()) { diff --git a/lib/colvars/colvarvalue.h b/lib/colvars/colvarvalue.h index fce0e1a970..41759e92b0 100644 --- a/lib/colvars/colvarvalue.h +++ b/lib/colvars/colvarvalue.h @@ -193,6 +193,12 @@ public: /// Derivative with respect to this \link colvarvalue \endlink of the square distance colvarvalue dist2_grad(colvarvalue const &x2) const; + /// Return the interpolation between x1 and x2 with weight lambda + /// (which must be between 0.0 and 1.0; lambda = 0.5 gives the midpoint) + static colvarvalue const interpolate(colvarvalue const &x1, + colvarvalue const &x2, + cvm::real const lambda = 0.5); + /// Assignment operator (type of x is checked) colvarvalue & operator = (colvarvalue const &x); @@ -285,10 +291,10 @@ public: cvm::real & operator [] (int const i); /// Ensure that the two types are the same within a binary operator - int static check_types(colvarvalue const &x1, colvarvalue
const &x2); /// Ensure that the two types are the same within an assignment, or that the left side is type_notset - int static check_types_assign(Type const &vt1, Type const &vt2); + static int check_types_assign(Type const &vt1, Type const &vt2); /// Undefined operation void undef_op() const; @@ -317,14 +323,14 @@ public: /// \brief Optimized routine for the inner product of one collective /// variable with an array - void static inner_opt(colvarvalue const &x, + static void inner_opt(colvarvalue const &x, std::vector::iterator &xv, std::vector::iterator const &xv_end, std::vector::iterator &result); /// \brief Optimized routine for the inner product of one collective /// variable with an array - void static inner_opt(colvarvalue const &x, + static void inner_opt(colvarvalue const &x, std::list::iterator &xv, std::list::iterator const &xv_end, std::vector::iterator &result); @@ -332,14 +338,14 @@ public: /// \brief Optimized routine for the second order Legendre /// polynomial, (3cos^2(w)-1)/2, of one collective variable with an /// array - void static p2leg_opt(colvarvalue const &x, + static void p2leg_opt(colvarvalue const &x, std::vector::iterator &xv, std::vector::iterator const &xv_end, std::vector::iterator &result); /// \brief Optimized routine for the second order Legendre /// polynomial of one collective variable with an array - void static p2leg_opt(colvarvalue const &x, + static void p2leg_opt(colvarvalue const &x, std::list::iterator &xv, std::list::iterator const &xv_end, std::vector::iterator &result); diff --git a/lib/gpu/Install.py b/lib/gpu/Install.py index 657f1c8fcc..6ea2159de5 100644 --- a/lib/gpu/Install.py +++ b/lib/gpu/Install.py @@ -14,7 +14,7 @@ Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision - specify one or more options, order does not matter -copies an existing Makefile.machine in lib/gpu to Makefile.auto +copies an existing Makefile.machine in lib/gpu to Makefile.auto optionally edits these variables in Makefile.auto: CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE optionally uses Makefile.auto to build the GPU library -> libgpu.a @@ -26,7 +26,7 @@ optionally copies Makefile.auto to a new Makefile.osuffix -h = set CUDA_HOME variable in Makefile.auto to hdir hdir = path to NVIDIA Cuda software, e.g. 
/usr/local/cuda -a = set CUDA_ARCH variable in Makefile.auto to arch - use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0) + use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0) or GeForce GTX 580 or similar use arch = 30 for Tesla K10 (Kepler) use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar @@ -108,10 +108,10 @@ if pflag: elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE" elif precision == "single": precstr = "-D_SINGLE_SINGLE" else: error("Invalid precision setting") - + # create Makefile.auto # reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested - + if not os.path.exists("Makefile.%s" % isuffix): error("lib/gpu/Makefile.%s does not exist" % isuffix) diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index b40bb7f943..47a216ff6f 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -22,21 +22,21 @@ offset=tid & (t_per_atom-1); \ ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; -#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - if (nbor_mem==packed_mem) { \ - nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \ - stride=fast_mul(t_per_atom,nbor_stride); \ - nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \ +#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \ + i, numj, n_stride, nbor_end, nbor_begin) \ + i=dev_nbor[ii]; \ + nbor_begin=ii+nbor_pitch; \ + numj=dev_nbor[nbor_begin]; \ + if (dev_nbor==dev_packed) { \ + nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \ + n_stride=fast_mul(t_per_atom,nbor_pitch); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \ nbor_begin+=offset; \ } else { \ - nbor_begin+=nbor_stride; \ - nbor_begin=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_pitch; \ + nbor_begin=dev_nbor[nbor_begin]; \ nbor_end=nbor_begin+numj; \ - stride=t_per_atom; \ + n_stride=t_per_atom; \ nbor_begin+=offset; \ } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index f772e36295..aa77a48c66 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ using namespace LAMMPS_AL; extern Device global_device; template -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_two, const char *k_three_center, - const char *k_three_end) { + const char *two, const char *three_center, + const char *three_end, const char *short_nbor) { screen=_screen; int gpu_nbor=0; @@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { + if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else // neigh yes or tpa == 1 _nbor_data=&(nbor->dev_nbor); if (_threads_per_atom*_threads_per_atom>device->warp_size()) return -10; @@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); 
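/* The renamed nbor_info macro in lal_aux_fun1.h above packs this logic into
   one expansion; a plain-C++ reading of the same bounds computation (local
   names match the new macro parameters; this is an explanatory sketch, not
   patch code):

   int i     = dev_nbor[ii];            // central atom index
   int begin = ii + nbor_pitch;         // slot holding i's neighbor count
   int numj  = dev_nbor[begin];
   int n_stride, end;
   if (dev_nbor == dev_packed) {        // packed layout, t_per_atom threads
     begin   += nbor_pitch + fast_mul(ii, t_per_atom - 1);
     n_stride = fast_mul(t_per_atom, nbor_pitch);
     end      = begin + fast_mul(numj / t_per_atom, n_stride)
                      + (numj & (t_per_atom - 1));
     begin   += offset;
   } else {                             // unpacked layout: one indirection
     begin    = dev_nbor[begin + nbor_pitch];
     end      = begin + numj;
     n_stride = t_per_atom;
     begin   += offset;
   }
*/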
_block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _max_an_bytes+=ans2->gpu_bytes(); #endif + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + return 0; } @@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() { k_three_end.clear(); k_three_end_vatom.clear(); k_pair.clear(); + k_short_nbor.clear(); delete pair_program; _compiled=false; } @@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, if (!success) return NULL; + _nall = nall; + // originally the requirement that nall == nlist was enforced // to allow direct indexing neighbors of neighbors after re-arrangement // nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size()); @@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, return 0; atom->cast_copy_x(host_x,host_type); + _nall = nall; + int mn; nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); @@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; + _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nlist; + int evatom=0; if (eatom || vatom) evatom=1; @@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nall; + int evatom=0; if (eatom || vatom) evatom=1; @@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const { template void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *ktwo, const char *kthree_center, - const char *kthree_end) { + const char *two, const char *three_center, + const char *three_end, const char* short_nbor) { if (_compiled) return; - std::string vatom_name=std::string(kthree_end)+"_vatom"; + std::string vatom_name=std::string(three_end)+"_vatom"; pair_program=new UCL_Program(dev); 
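/* The dev_short_nbor buffer resized above backs the new short-neighbor-list
   optimization: once per step a k_*_short_nbor kernel filters the full list
   down to pairs within the short pairwise cutoff of the potential, storing a
   per-atom count followed by the surviving indices (the 2+_max_nbors sizing
   leaves room for the count plus bookkeeping).  A serial C++ outline of that
   filter; rsq(), cutshortsq, slot_of() and count_slot() are placeholders for
   the inline distance test, cutoff, and strided index arithmetic of the real
   kernels:

   int count = 0;
   for (int nbor = nbor_begin; nbor < nbor_end; nbor += n_stride) {
     int j = dev_packed[nbor] & NEIGHMASK;   // strip special-bond bits
     if (rsq(i, j) < cutshortsq)             // inside the short cutoff?
       dev_short_nbor[slot_of(i, count++)] = j;
   }
   dev_short_nbor[count_slot(i)] = count;    // read back by force kernels

   The two-body, zeta, and three-body kernels then loop over this trimmed
   list instead of re-testing every entry of the full neighbor list. */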
pair_program->load_string(pair_str,device->compile_string().c_str()); - k_three_center.set_function(*pair_program,kthree_center); - k_three_end.set_function(*pair_program,kthree_end); + k_three_center.set_function(*pair_program,three_center); + k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); - k_pair.set_function(*pair_program,ktwo); + k_pair.set_function(*pair_program,two); + k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); #ifdef THREE_CONCURRENT diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 4f27ecdf92..f5f36863c4 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -56,7 +56,8 @@ class BaseThree { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, - const char *k_three_center, const char *k_three_end); + const char *k_three_center, const char *k_three_end, + const char *k_short_nbor=NULL); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -73,18 +74,18 @@ class BaseThree { } /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device - * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + /** \param inum number of particles whose nbors must be stored on device + * \param max_nbors maximum number of neighbors + * \param success set to false if insufficient memory * \note olist_size=total number of local particles **/ inline void resize_local(const int inum, const int max_nbors, bool &success) { nbor->resize(inum,max_nbors,success); } /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device + /** \param inum number of particles whose nbors must be stored on device * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + * \param max_nbors current maximum number of neighbors * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ @@ -143,14 +144,6 @@ class BaseThree { const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); - /// Pair loop with device neighboring - int * compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); - /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -193,6 +186,9 @@ class BaseThree { /// Neighbor data Neighbor *nbor; + UCL_D_Vec dev_short_nbor; + UCL_Kernel k_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; @@ -207,12 +203,13 @@ class BaseThree { int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _gpu_nbor; double _max_bytes, _max_an_bytes; + int _max_nbors, _ainum, _nall; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; void 
compile_kernels(UCL_Device &dev, const void *pair_string, - const char *k_two, const char *k_three_center, - const char *k_three_end); + const char *two, const char *three_center, + const char *three_end, const char* short_nbor); virtual void loop(const bool _eflag, const bool _vflag, const int evatom) = 0; diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 3492d7030e..24984e4878 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end"); + "k_sw_three_end","k_sw_short_nbor"); if (success!=0) return success; @@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast(ceil(static_cast(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); - + this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 46330c59e4..a5c9f49d08 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -130,6 +130,63 @@ texture sw3_tex; #endif +__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw3, + const 
__global int *restrict map, + const __global int *restrict elem2param, + const int nelements, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (iiinit_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", - "k_tersoff_three_center", "k_tersoff_three_end"); + "k_tersoff_three_center", "k_tersoff_three_end", + "k_tersoff_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. -// --------------------------------------------------------------------------- -template -void TersoffT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - 
this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int ** TersoffT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- @@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, 
&this->_threads_per_atom); + + // re-allocate zetaij if necessary + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(this->_max_nbors*_nmax); + } + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b7d48d9e34..cdeb5679d8 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,7 +106,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,65 @@ texture ts5_tex; #endif +__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + 
const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; // compute zeta_ij z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, if (ii cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -520,9 +613,15 @@ __kernel void 
k_tersoff_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k { const double* h, const double* gamma, const double* beta, const double* powern, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -104,8 +89,7 @@ class Tersoff : public BaseThree { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 553dad3583..c37c07f1a1 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end"); + "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const { 
#define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. -// --------------------------------------------------------------------------- -template -void TersoffMT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int ** TersoffMT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) 
{ - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- @@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + // re-allocate zetaij if necessary + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(this->_max_nbors*_nmax); + } + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, 
&nbor_pitch, &this->_threads_per_atom); @@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 3a81b36941..576359b514 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -106,7 +106,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,65 @@ texture ts5_tex; #endif +__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; - // compute zeta_ij - z = (numtyp)0; + z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// 
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, if (ii cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += 
n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k { const double* h, const double* beta, const double* powern, const double* powern_del, const double* ca1, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -104,8 +89,7 @@ class TersoffMod : public BaseThree { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 9cce8a802d..341f663030 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end"); + "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor"); if (success!=0) return success; @@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast(cutsqmax); + UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. 
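// ---------------------------------------------------------------------------
// Editor's aside (hedged sketch): in the init() hunks above, while host_cutsq
// is copied into the device view, the patch now also records the largest
// entry and stores it as _cutshortsq; that single radius is what the
// short-neighbor kernel filters against. A minimal standalone equivalent
// (the helper name max_cutsq is hypothetical):
// ---------------------------------------------------------------------------
#include <vector>

static double max_cutsq(const std::vector<double> &host_cutsq) {
  double cutsqmax = 0.0;
  for (size_t i = 0; i < host_cutsq.size(); ++i)
    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; // track the max
  return cutsqmax;  // becomes _cutshortsq after a cast to numtyp
}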
-// --------------------------------------------------------------------------- -template -void TersoffZT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nlist; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template -int ** TersoffZT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - 
this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } - - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Calculate energies, forces, and torques // --------------------------------------------------------------------------- @@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int ainum=this->ans->inum(); + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + int GX=static_cast(ceil(static_cast(ainum)/ + (BX/this->_threads_per_atom))); + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + // re-allocate zetaij if necessary + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + _zetaij.resize(this->_max_nbors*_nmax); + } + + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + + this->k_zeta.set_size(GX,BX); + this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, 
&map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 9509b9802c..e8bb017f59 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -109,7 +109,7 @@ texture ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -158,7 +158,7 @@ texture ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -167,6 +167,65 @@ texture ts6_tex; #endif +__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii cutsq[ijparam]) continue; - // compute zeta_ij z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -342,6 
+415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, if (ii cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k { const double* ZBLcut, const double* ZBLexpscale, const double global_e, const double 
global_a_0, const double global_epsilon_0, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; - int _max_nbors; numtyp _global_e,_global_a_0,_global_epsilon_0; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index 96537e65d3..d03ac992bd 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", - "k_vashishta_three_end"); + "k_vashishta_three_end","k_vashishta_short_nbor"); if (success!=0) return success; @@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); + double r0sqmax = 0; for (int i=0; i(r0sq); dview[i].y=static_cast(gamma[i]); dview[i].z=static_cast(cutsq[i]); dview[i].w=static_cast(r0[i]); } + _cutshortsq = static_cast(r0sqmax); + ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast(ceil(static_cast(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, ¶m4, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); + // note that k_pair does not run with the short neighbor list this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, @@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, 
&this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { end_ans=this->ans; #endif if (evatom!=0) { - this->k_three_end_vatom.set_size(GX,BX); this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index caa3c03613..fa7f413aa5 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -136,6 +136,64 @@ texture param5_tex; #endif +__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict param4, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii0) energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0); @@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \ numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \ - numtyp gsrainv1 = param_gamma_ij * rainv1; \ + numtyp gsrainv1 = param_gamma_ij * rainv1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \ \ numtyp r2 = ucl_sqrt(rsq2); \ numtyp rinvsq2 = ucl_recip(rsq2); \ numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \ - numtyp gsrainv2 = param_gamma_ik * rainv2; \ + numtyp gsrainv2 = param_gamma_ik * rainv2; \ numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \ \ numtyp rinv12 = ucl_recip(r1*r2); \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ - numtyp delcs = cs - param_costheta_ijk; \ + numtyp delcs = cs - param_costheta_ijk; \ numtyp delcssq = delcs*delcs; \ - numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ + numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ numtyp pcsinvsq = pcsinv*pcsinv; \ numtyp pcs = delcssq/pcsinv; \ \ numtyp facexp = expgsrainv1*expgsrainv2; \ \ - numtyp facrad = param_bigb_ijk * facexp*pcs; \ + numtyp facrad = param_bigb_ijk * facexp*pcs; \ numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad2 = facrad*gsrainvsq2; \ - numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ + numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = 
ucl_recip(rsq1); \ numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \ - numtyp gsrainv1 = param_gamma_ij * rainv1; \ + numtyp gsrainv1 = param_gamma_ij * rainv1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \ \ numtyp r2 = ucl_sqrt(rsq2); \ numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \ - numtyp gsrainv2 = param_gamma_ik * rainv2; \ + numtyp gsrainv2 = param_gamma_ik * rainv2; \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \ \ numtyp rinv12 = ucl_recip(r1*r2); \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ - numtyp delcs = cs - param_costheta_ijk; \ + numtyp delcs = cs - param_costheta_ijk; \ numtyp delcssq = delcs*delcs; \ - numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ + numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ numtyp pcsinvsq = pcsinv*pcsinv; \ numtyp pcs = delcssq/pcsinv; \ \ numtyp facexp = expgsrainv1*expgsrainv2; \ \ - numtyp facrad = param_bigb_ijk * facexp*pcs; \ + numtyp facrad = param_bigb_ijk * facexp*pcs; \ numtyp frad1 = facrad*gsrainvsq1; \ - numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ + numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, if (ii param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; - int nbor_k=nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) - nbor_k+=n_stride; + int nbor_k,k_end; + if (dev_packed==dev_nbor) { + nbor_k=nborj_start-offset_j+offset_k; + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } else { + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; + k_end = nbor_end; + } - for ( ; nbor_k param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij = param4_ijparam.w; @@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; @@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k { UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; + numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; diff --git a/lib/kim/Install.py b/lib/kim/Install.py index 21ea859852..aa244ee6ea 100644 --- a/lib/kim/Install.py +++ b/lib/kim/Install.py @@ -6,6 +6,8 
@@ from __future__ import print_function import sys,os,re,subprocess +# help message + help = """ Syntax from src dir: make lib-kim args="-b -v version -a kim-name" or: make lib-kim args="-b -a everything" @@ -23,7 +25,7 @@ specify one or more options, order does not matter -b = download and build base KIM API library with example Models this will delete any previous installation in the current folder -n = do NOT download and build base KIM API library. - Use an existing installation + Use an existing installation -p = specify location of KIM API installation (implies -n) -a = add single KIM model or model driver with kim-name to existing KIM API lib (see example below). @@ -78,13 +80,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # parse args diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index acb54ff22f..3fe9e46111 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,46 @@ # Change Log +## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13) + +**Implemented enhancements:** + +- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406) +- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630) +- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898) +- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904) +- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737) +- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890) +- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843) +- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842) +- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870) +- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824) +- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853) +- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852) +- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771) +- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros 
[\#716](https://github.com/kokkos/kokkos/issues/716) +- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668) +- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566) +- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214) + +**Fixed bugs:** + +- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975) +- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941) +- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940) +- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939) +- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917) +- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863) +- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862) +- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860) +- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829) +- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826) +- Kokkos CUDA failes to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776) +- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767) +- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758) +- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670) +- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560) + ## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 24cd772e00..d2967cf9a3 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l)) # Check for advanced settings. 
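# Editor's note (illustrative usage, not taken from the patch itself): these
# "advanced settings" are plain grep tests against the KOKKOS_OPTIONS
# variable, so the new compiler-warnings support added below is switched on
# with something like
#   make KOKKOS_OPTIONS="compiler_warnings"
# which routes the per-compiler warning flags into KOKKOS_CXXFLAGS.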
+KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l)) KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l)) KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l)) KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l)) @@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2 KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l)) -KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) ifneq ($(OMPI_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l)) endif ifneq ($(MPICH_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l)) endif +KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2) KOKKOS_INTERNAL_COMPILER_CLANG = 1 @@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) endif endif +# Set compiler warnings flags. +ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) + # TODO check if PGI accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) + # TODO check if cray accepts GNU style warnings + KOKKOS_INTERNAL_COMPILER_WARNINGS = + else + #gcc + KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized + endif + endif + endif + endif + endif +else + KOKKOS_INTERNAL_COMPILER_WARNINGS = +endif + # Set OpenMP flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -mp @@ -162,6 +193,7 @@ endif # Intel based. 
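# Editor's note: WSM (Westmere) is the newly recognized entry in the
# architecture list below; unlike the AVX-class architectures it maps to
# plain SSE4.2 code generation (-msse4.2, -xSSE4.2, or -tp=nehalem,
# depending on the compiler).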
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l)) KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l)) KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l)) @@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l)) # Any AVX? +KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Decide what ISA level we are able to support. -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc )) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc )) @@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) # Incompatible flags? 
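# Editor's note: the new SSE42 count is also added to the MULTIHOST sum
# below, so naming Westmere together with any other host architecture in
# KOKKOS_ARCH trips the same multiple-host-architecture check as before.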
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@@ -257,12 +290,10 @@ endif

 KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src

-# No warnings:
 KOKKOS_CXXFLAGS =
-# INTEL and CLANG warnings:
-#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
-# GCC warnings:
-#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
+ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
+endif

 KOKKOS_LIBS = -lkokkos -ldl
 KOKKOS_LDFLAGS = -L$(shell pwd)
@@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
   endif
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xSSE4.2
+    KOKKOS_LDFLAGS += -xSSE4.2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=nehalem
+        KOKKOS_LDFLAGS += -tp=nehalem
+      else
+        # Assume that this is really a GNU compiler.
+ KOKKOS_CXXFLAGS += -msse4.2 + KOKKOS_LDFLAGS += -msse4.2 + endif + endif + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp ) @@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif -KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h) +KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l)) else diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 3cb52a04cd..a9341a907c 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp +Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp +Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp @@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp index 1e7ee68549..c2c118ce1a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp @@ -61,14 +61,19 @@ protected: { std::cout << std::setprecision(5) << std::scientific; - unsigned threads_count = omp_get_max_threads(); + int threads_count = 0; + #pragma omp parallel + { + #pragma omp atomic + ++threads_count; + } - if ( Kokkos::hwloc::available() ) { - threads_count = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); + if (threads_count > 3) { + threads_count /= 2; } Kokkos::OpenMP::initialize( threads_count ); + 
Kokkos::OpenMP::print_configuration( std::cout ); } static void TearDownTestCase() diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 9cf02f74b4..2771f1793d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -1,12 +1,12 @@ //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -35,7 +35,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER @@ -283,12 +283,12 @@ struct test_random_scalar { RandomGenerator& pool, unsigned int num_draws) { - using std::cerr; + using std::cout; using std::endl; using Kokkos::parallel_reduce; { - cerr << " -- Testing randomness properties" << endl; + cout << " -- Testing randomness properties" << endl; RandomProperties result; typedef test_random_functor functor_type; @@ -307,7 +307,7 @@ struct test_random_scalar { ( 1.5*tolerance > variance_eps)) ? 1:0; pass_covar = ((-2.0*tolerance < covariance_eps) && ( 2.0*tolerance > covariance_eps)) ? 1:0; - cerr << "Pass: " << pass_mean + cout << "Pass: " << pass_mean << " " << pass_var << " " << mean_eps << " " << variance_eps @@ -315,7 +315,7 @@ struct test_random_scalar { << " || " << tolerance << endl; } { - cerr << " -- Testing 1-D histogram" << endl; + cout << " -- Testing 1-D histogram" << endl; RandomProperties result; typedef test_histogram1d_functor functor_type; @@ -335,7 +335,7 @@ struct test_random_scalar { pass_hist1d_covar = ((-0.06 < covariance_eps) && ( 0.06 > covariance_eps)) ? 1:0; - cerr << "Density 1D: " << mean_eps + cout << "Density 1D: " << mean_eps << " " << variance_eps << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D) << " || " << tolerance @@ -348,7 +348,7 @@ struct test_random_scalar { << endl; } { - cerr << " -- Testing 3-D histogram" << endl; + cout << " -- Testing 3-D histogram" << endl; RandomProperties result; typedef test_histogram3d_functor functor_type; @@ -368,7 +368,7 @@ struct test_random_scalar { pass_hist3d_covar = ((-tolerance < covariance_eps) && ( tolerance > covariance_eps)) ? 
1:0; - cerr << "Density 3D: " << mean_eps + cout << "Density 3D: " << mean_eps << " " << variance_eps << " " << result.covariance/HIST_DIM1D/HIST_DIM1D << " || " << tolerance @@ -381,18 +381,18 @@ struct test_random_scalar { template void test_random(unsigned int num_draws) { - using std::cerr; + using std::cout; using std::endl; typename test_random_functor::type_1d density_1d("D1d"); typename test_random_functor::type_3d density_3d("D3d"); uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - cerr << "Test Seed:" << ticks << endl; + cout << "Test Seed:" << ticks << endl; RandomGenerator pool(ticks); - cerr << "Test Scalar=int" << endl; + cout << "Test Scalar=int" << endl; test_random_scalar test_int(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int.pass_mean,1); ASSERT_EQ( test_int.pass_var,1); @@ -406,7 +406,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=unsigned int" << endl; + cout << "Test Scalar=unsigned int" << endl; test_random_scalar test_uint(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_uint.pass_mean,1); ASSERT_EQ( test_uint.pass_var,1); @@ -420,7 +420,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=int64_t" << endl; + cout << "Test Scalar=int64_t" << endl; test_random_scalar test_int64(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_int64.pass_mean,1); ASSERT_EQ( test_int64.pass_var,1); @@ -434,7 +434,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=uint64_t" << endl; + cout << "Test Scalar=uint64_t" << endl; test_random_scalar test_uint64(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_uint64.pass_mean,1); ASSERT_EQ( test_uint64.pass_var,1); @@ -448,7 +448,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=float" << endl; + cout << "Test Scalar=float" << endl; test_random_scalar test_float(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_float.pass_mean,1); ASSERT_EQ( test_float.pass_var,1); @@ -462,7 +462,7 @@ void test_random(unsigned int num_draws) deep_copy(density_1d,0); deep_copy(density_3d,0); - cerr << "Test Scalar=double" << endl; + cout << "Test Scalar=double" << endl; test_random_scalar test_double(density_1d,density_3d,pool,num_draws); ASSERT_EQ( test_double.pass_mean,1); ASSERT_EQ( test_double.pass_var,1); diff --git a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp index f952ab3db5..9e75b580bc 100644 --- a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp +++ b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp index f545247212..8db5ce0eb5 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -44,12 +44,13 @@ #include #include #include +#include int main(int argc, char* argv[]) { Kokkos::initialize(); - - if(argc<10) { + + if(argc<10) { printf("Arguments: N K R D U F T S\n"); printf(" P: Precision (1==float, 2==double)\n"); printf(" N,K: dimensions of the 2D array to allocate\n"); @@ -68,7 +69,7 @@ int main(int argc, char* argv[]) { Kokkos::finalize(); return 0; } - + int P = atoi(argv[1]); int N = atoi(argv[2]); @@ -80,7 +81,7 @@ int main(int argc, char* argv[]) { int T = atoi(argv[8]); int S = atoi(argv[9]); - if(U>8) {printf("U must be 1-8\n"); return 0;} + if(U>8) {printf("U must be 1-8\n"); return 0;} if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;} if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;} diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp index 161c6f2091..88eb0493c1 100644 --- a/lib/kokkos/benchmarks/gather/main.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -44,11 +44,11 @@ #include #include #include +#include int main(int argc, char* argv[]) { Kokkos::initialize(argc,argv); - if(argc<8) { printf("Arguments: S N K D\n"); printf(" S: Scalar Type Size (1==float, 2==double, 4=complex)\n"); diff --git a/lib/kokkos/benchmarks/policy_performance/Makefile b/lib/kokkos/benchmarks/policy_performance/Makefile new file mode 100644 index 0000000000..13aef3209c --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/Makefile @@ -0,0 +1,44 @@ +KOKKOS_PATH = ../.. +SRC = $(wildcard *.cpp) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 -g +LINK = ${CXX} +LINKFLAGS = +EXE = policy_performance.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +KOKKOS_CUDA_OPTIONS+=enable_lambda +else +CXX = g++ +CXXFLAGS = -O3 -g -Wall -Werror +LINK = ${CXX} +LINKFLAGS = +EXE = policy_performance.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.o) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< diff --git a/lib/kokkos/benchmarks/policy_performance/main.cpp b/lib/kokkos/benchmarks/policy_performance/main.cpp new file mode 100644 index 0000000000..b0ed9bb512 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/main.cpp @@ -0,0 +1,170 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include "policy_perf_test.hpp" + +int main(int argc, char* argv[] ) { + Kokkos::initialize(argc,argv); + + if(argc<10) { + printf(" Ten arguments are needed to run this program:\n"); + printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n"); + printf(" team_range: number of teams (league_size)\n"); + printf(" thread_range: range for nested TeamThreadRange parallel_*\n"); + printf(" vector_range: range for nested ThreadVectorRange parallel_*\n"); + printf(" outer_repeat: number of repeats for outer parallel_* call\n"); + printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n"); + printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n"); + printf(" team_size: number of team members (team_size)\n"); + printf(" vector_size: desired vectorization (if possible)\n"); + printf(" schedule: 1 == Static 2 == Dynamic\n"); + printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n"); + printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n"); + printf(" TeamPolicy:\n"); + printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n"); + printf(" RangePolicy:\n"); + printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n"); + printf(" Y: 0 = none\n"); + printf(" Z: 0 = none\n"); + printf(" Example Input:\n"); + printf(" 100000 32 32 100 100 100 8 1 1 100\n"); + Kokkos::finalize(); + return 0; + } + + int team_range = atoi(argv[1]); + int thread_range = atoi(argv[2]); + int vector_range = atoi(argv[3]); + + int outer_repeat = atoi(argv[4]); + int thread_repeat = atoi(argv[5]); + int 
vector_repeat = atoi(argv[6]); + + int team_size = atoi(argv[7]); + int vector_size = atoi(argv[8]); + int schedule = atoi(argv[9]); + int test_type = atoi(argv[10]); + + int disable_verbose_output = 0; + if ( argc > 11 ) { + disable_verbose_output = atoi(argv[11]); + } + + if ( schedule != 1 && schedule != 2 ) { + printf("schedule: %d\n", schedule); + printf("Options for schedule are: 1 == Static 2 == Dynamic\n"); + Kokkos::finalize(); + return -1; + } + + if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122 + && test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222 + && test_type != 300 && test_type != 400 && test_type != 500 + ) + { + printf("Incorrect test_type option\n"); + Kokkos::finalize(); + return -2; + } + + double result = 0.0; + + Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) { + lval += 1; + }, result); + + typedef Kokkos::View view_type_1d; + typedef Kokkos::View view_type_2d; + typedef Kokkos::View view_type_3d; + + // Allocate view without initializing + // Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc + // Second call to test is the one we actually care about and time + view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size); + view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range); + view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range); + + double result_computed = 0.0; + double result_expect = 0.0; + double time = 0.0; + + if(schedule==1) { + if ( test_type != 500 ) { + // warmup - no repeat of loops + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + } + if(schedule==2) { + if ( test_type != 500 ) { + // warmup - no repeat of loops + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + else { + // parallel_scan: initialize 1d view for parallel_scan + test_policy,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time); + test_policy,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time); + } + } + + if ( disable_verbose_output == 0 ) { + printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i 
%3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time); + } + else { + printf("%lf\n",time); + } + + Kokkos::finalize(); + + return 0; +} diff --git a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp new file mode 100644 index 0000000000..8c79f3b88d --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -0,0 +1,354 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +template < class ViewType > +struct ParallelScanFunctor { + using value_type = double; + ViewType v; + + ParallelScanFunctor( const ViewType & v_ ) + : v(v_) + {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int idx, value_type& val, const bool& final ) const + { + // inclusive scan + val += v(idx); + if ( final ) { + v(idx) = val; + } + } +}; + +template +void test_policy(int team_range, int thread_range, int vector_range, + int outer_repeat, int thread_repeat, int inner_repeat, + int team_size, int vector_size, int test_type, + ViewType1 &v1, ViewType2 &v2, ViewType3 &v3, + double &result, double &result_expect, double &time) { + + typedef Kokkos::TeamPolicy t_policy; + typedef typename t_policy::member_type t_team; + Kokkos::Timer timer; + + for(int orep = 0; orep(v1) +#if 0 + // This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation + KOKKOS_LAMBDA (const int idx, double& val, const bool& final) { + // inclusive scan + val += v1(idx); + if ( final ) { + v1(idx) = val; + } + } +#endif + ); + // result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print + // result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1); + } + + } // end outer for loop + + time = timer.seconds(); +} //end test_policy diff --git a/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh b/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh new file mode 100755 index 0000000000..e621fffbd4 --- /dev/null +++ b/lib/kokkos/benchmarks/policy_performance/script_basic_testing.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Script to check policy_perf_test code works with each possible combo of options + +echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies" + +EXECUTABLE=policy_performance + +TEAMRANGE=1000 +THREADRANGE=4 +VECTORRANGE=32 +TEAMSIZE=4 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Host tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Host tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi + +SUFFIX=cuda +if [ -e $EXECUTABLE.$SUFFIX ] +then +SCHEDULE=1 +echo "Cuda tests Static schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done + +SCHEDULE=2 +echo "Cuda tests Dynamic schedule" +for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500} +do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE +done +fi diff --git a/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh new file mode 100755 index 0000000000..f4bfb87f8f --- /dev/null 
+++ b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Sample script for benchmarking policy performance + +# Suggested enviroment variables to export prior to executing script: +# KNL: +# OMP_NUM_THREADS=256 KMP_AFFINITY=compact +# Power: +# OMP_NUM_THREADS=64 OMP_PROC_BIND=true + +# Constants and Variables: +# Vary: TEAMSIZE, and THREADRANGE +# for TEAMSIZE in {1,2,4,5,8}; do +# for THREADRANGE in {32,41,1000}; do +# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE +# System specific: Adjust REPEAT values to architecture tests are run on + +# Tests +# Static SCHEDULE = 1 +# Tier 1: parallel_for + RangePolicy 300 +# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY +# Dynamic SCHEDULE = 2 +# Tier 5: parallel_for + RangePolicy 300 +# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500 +# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY +# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY + +# Results grouped by: +# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE + +EXECUTABLE=policy_performance + +# Default defined values +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 +OREPEAT=1 +MREPEAT=1 +IREPEAT=1 +SCHEDULE=1 + +# Host tests +SUFFIX=host +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Host" + +for SCHEDULE in {1,2}; do + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi # end host + + +# Cuda tests +SUFFIX=cuda +# TEAMRANGE=10000, TEAMSIZE=8 too large +# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large +if [ -e $EXECUTABLE.$SUFFIX ]; then +echo "Cuda" + +for SCHEDULE in {1,2}; do + +# Reset defaults +TEAMRANGE=1000 +THREADRANGE=1 +VECTORRANGE=32 +TEAMSIZE=1 +VECTORSIZE=1 + +# Tier 1 and 2, 5 and 6 +for CODE in {300,400,500}; do + for TEAMSIZE in {1,2,4,5,8}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done +done + +# Tier 3, 7 +for CODE in {100,110,111,112,120,121,122}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +# Tier 4, 8 +for CODE in {200,210,211,212,220,221,222}; do + for TEAMSIZE in {1,2,4,5,8}; do + for THREADRANGE in {32,41,1000}; do + ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE + done + done +done + +done # end SCHEDULE + +fi 
#end cuda diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind new file mode 100755 index 0000000000..ca34648780 --- /dev/null +++ b/lib/kokkos/bin/hpcbind @@ -0,0 +1,454 @@ +#!/usr/bin/env bash + +################################################################################ +# Check if hwloc commands exist +################################################################################ +declare -i HPCBIND_HAS_HWLOC=1 +type hwloc-bind >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-distrib >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ls >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-calc >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +type hwloc-ps >/dev/null 2>&1 +HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?)) + +if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then + echo "hwloc not found, no process binding will occur" +fi + +# Get parent cpuset +HPCBIND_HWLOC_PARENT_CPUSET="" +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + MY_PID="$BASHPID" + HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) +fi + +################################################################################ +# Check if nvidia-smi exist +################################################################################ +declare -i HPCBIND_HAS_NVIDIA=0 +type nvidia-smi >/dev/null 2>&1 +HPCBIND_HAS_NVIDIA=$((!$?)) + + +################################################################################ +# Get visible gpu +################################################################################ +declare -i NUM_GPUS=0 +HPCBIND_VISIBLE_GPUS="" +if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + NUM_GPUS=$(nvidia-smi -L | wc -l); + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} +fi + +declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) + + +################################################################################ +# Get queue id +# supports sbatch, bsub, aprun +################################################################################ +HPCBIND_QUEUE_NAME="" +declare -i HPCBIND_QUEUE_INDEX=0 +declare -i HPCBIND_QUEUE_GPU_MAPPING=0 + +if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="sbatch" + HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} +elif [[ ! -z "${LBS_JOBINDEX}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="bsub" + HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} +elif [[ ! -z "${ALPS_APP_PE}" ]]; then + HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_NAME="aprun" + HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} +fi + + +################################################################################ +# Show help +################################################################################ +function show_help { + local cmd=$(basename "$0") + echo "Usage: ${cmd} -- command ..." + echo " Set the process mask, OMP environment variables and CUDA environment" + echo " variables to sane values if possible. Uses hwloc and nvidia-smi if" + echo " available. Will preserve the current process binding, so it is safe" + echo " to use with a queuing system or mpiexec." 
+  echo ""
+  echo "Options:"
+  echo "  --no-hwloc-bind           Disable binding"
+  echo "  --proc-bind=<LOC>         Set the initial process mask for the script"
+  echo "                            LOC can be any valid location argument for"
+  echo "                            hwloc-calc   Default: all"
+  echo "  --distribute=N            Distribute the current cpuset into N partitions"
+  echo "  --distribute-partition=I"
+  echo "                            Use the i'th partition (zero based)"
+  echo "  --visible-gpus=           Comma separated list of gpu ids"
+  echo "                            Default: CUDA_VISIBLE_DEVICES or all gpus in"
+  echo "                            sequential order"
+  echo "  --gpu-ignore-queue        Ignore queue job id when choosing visible GPU"
+  echo "  --no-gpu-mapping          Do not set CUDA_VISIBLE_DEVICES"
+  echo "  --openmp=M.m              Set env variables for the given OpenMP version"
+  echo "                            Default: 4.0"
+  echo "  --openmp-percent=N        Integer percentage of cpuset to use for OpenMP"
+  echo "                            threads   Default: 100"
+  echo "  --openmp-places=<Op>      Op=threads|cores|sockets. Default: threads"
+  echo "  --no-openmp-proc-bind     Set OMP_PROC_BIND to false and unset OMP_PLACES"
+  echo "  --force-openmp-num-threads=N"
+  echo "                            Override logic for selecting OMP_NUM_THREADS"
+  echo "  --force-openmp-proc-bind="
+  echo "                            Override logic for selecting OMP_PROC_BIND"
+  echo "  --no-openmp-nested        Set OMP_NESTED to false"
+  echo "  --show-bindings           Show the bindings"
+  echo "  --lstopo                  Show bindings in lstopo without executing a command"
+  echo "  -v|--verbose              Show options and relevant environment variables"
+  echo "  -h|--help                 Show this message"
+  echo ""
+  echo "Sample Usage:"
+  echo "  Split the current process cpuset into 4 and use the 3rd partition"
+  echo "    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
+  echo "  Bind the process to all even cores"
+  echo "    ${cmd} --proc-bind=core:even -v -- command ..."
+  echo "  Bind to the first 64 cores and split the current process cpuset into 4"
+  echo "    ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
+  echo "  Skip GPU 0 when mapping visible devices"
+  echo "    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
+  echo "  Display the current bindings"
+  echo "    ${cmd} --proc-bind=numa:0 --show-bindings -- command"
+  echo "  Display the current bindings using lstopo"
+  echo "    ${cmd} --proc-bind=numa:0.core:odd --lstopo"
+  echo ""
+}
+
+
+################################################################################
+# Parse command line arguments
+################################################################################
+# Show help if no command line arguments given
+if [[ "$#" -eq 0 ]]; then
+  show_help
+  exit 0
+fi
+
+declare -a UNKNOWN_ARGS=()
+declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
+declare -i HPCBIND_DISTRIBUTE=1
+declare -i HPCBIND_PARTITION=0
+HPCBIND_PROC_BIND="all"
+HPCBIND_OPENMP_VERSION=4.0
+declare -i HPCBIND_OPENMP_PERCENT=100
+HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
+declare -i HPCBIND_OPENMP_PROC_BIND=1
+declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
+HPCBIND_OPENMP_FORCE_PROC_BIND=""
+HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
+declare -i HPCBIND_VERBOSE=0
+
+declare -i HPCBIND_SHOW_BINDINGS=0
+declare -i HPCBIND_LSTOPO=0
+
+for i in $@; do
+  case $i in
+    # number of partitions to create
+    --no-hwloc-bind)
+      HPCBIND_ENABLE_HWLOC_BIND=0
+      shift
+      ;;
+    --proc-bind=*)
+      HPCBIND_PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --distribute=*)
+      HPCBIND_DISTRIBUTE="${i#*=}"
+      shift
+      ;;
+    # which partition to use
+    --distribute-partition=*)
+      HPCBIND_PARTITION="${i#*=}"
+      shift
+      ;;
+    --visible-gpus=*)
+      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
+      shift
+      ;;
+    --gpu-ignore-queue)
+      HPCBIND_QUEUE_GPU_MAPPING=0
+      shift
+      ;;
+    --no-gpu-mapping)
+      HPCBIND_ENABLE_GPU_MAPPING=0
+      shift
+      ;;
+    --openmp=*)
+      HPCBIND_OPENMP_VERSION="${i#*=}"
+      shift
+      ;;
+    --openmp-percent=*)
+      HPCBIND_OPENMP_PERCENT="${i#*=}"
+      shift
+      ;;
+    --openmp-places=*)
+      HPCBIND_OPENMP_PLACES="${i#*=}"
+      shift
+      ;;
+    --no-openmp-proc-bind)
+      HPCBIND_OPENMP_PROC_BIND=0
+      shift
+      ;;
+    --force-openmp-proc-bind=*)
+      HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --force-openmp-num-threads=*)
+      HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
+      shift
+      ;;
+    --no-openmp-nested)
+      HPCBIND_OPENMP_NESTED="false"
+      shift
+      ;;
+    --show-bindings)
+      HPCBIND_VERBOSE=1
+      HPCBIND_SHOW_BINDINGS=1
+      shift
+      ;;
+    --lstopo)
+      HPCBIND_VERBOSE=1
+      HPCBIND_SHOW_BINDINGS=0
+      HPCBIND_LSTOPO=1
+      shift
+      ;;
+    -v|--verbose)
+      HPCBIND_VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    # ignore remaining arguments
+    --)
+      shift
+      break
+      ;;
+    # unknown option
+    *)
+      UNKNOWN_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+
+################################################################################
+# Check unknown arguments
+################################################################################
+if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
+  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
+  exit 1
+fi
+
+
+################################################################################
+# Check that visible gpus are valid
+################################################################################
+HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
+if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
+  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
+    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
+          ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
+      echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
+      HPCBIND_VISIBLE_GPUS[$i]=0;
+    fi
+  done
+  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
+fi
+
+
+################################################################################
+# Check OpenMP percent
+################################################################################ +if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then + echo "OpenMP percent < 1, setting to 1" + HPCBIND_OPENMP_PERCENT=1 +elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then + echo "OpenMP percent > 100, setting to 100" + HPCBIND_OPENMP_PERCENT=100 +fi + +################################################################################ +# Check distribute +################################################################################ +if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then + echo "Invalid input for distribute, changing distribute to 1" + HPCBIND_DISTRIBUTE=1 +fi + +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + echo "Invalid input for distribute-partition, changing to 0" + HPCBIND_PARTITION=0 +fi + + +################################################################################ +# Find cpuset and num threads +################################################################################ +HPCBIND_HWLOC_CPUSET="" +declare -i HPCBIND_NUM_PUS=0 + +if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + else + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + fi + + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) +else + HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) +fi + +declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT)) +HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100)) + + +if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then + HPCBIND_OPENMP_NUM_THREADS=1 +elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS} +fi + +if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS} +fi + +################################################################################ +# Set OpenMP environment variables +################################################################################ + +# set OMP_NUM_THREADS +export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS} + +# set OMP_PROC_BIND and OMP_PLACES +if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then + if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then + #default proc bind logic + if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="spread" + else + export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + #force proc bind + export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" + export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}" + fi +else + # no openmp proc bind + unset OMP_PLACES + unset OMP_PROC_BIND +fi + +# set OMP_NESTED +export OMP_NESTED=${HPCBIND_OPENMP_NESTED} + + +################################################################################ +# Set CUDA environment variables +################################################################################ + +if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then + if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + else + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * 
HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) + declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) + export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + fi +fi + +################################################################################ +# Set hpcbind environment variables +################################################################################ +export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} +export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} +export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} +export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} +export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} +if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then + export HPCBIND_HWLOC_PARENT_CPUSET="all" +else + export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} +fi +export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} +export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') +export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then + export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} + export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} + export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} +fi + + +################################################################################ +# Print verbose +################################################################################ + +if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then + MY_ENV=$(env | sort) + echo "[HPCBIND]" + echo "${MY_ENV}" | grep -E "^HPCBIND_" + echo "[CUDA]" + echo "${MY_ENV}" | grep -E "^CUDA_" + echo "[OPENMP]" + echo "${MY_ENV}" | grep -E "^OMP_" +fi + +if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then + echo "[BINDINGS]" + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu +elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then + echo "Unable to show bindings, hwloc not available." +fi + +################################################################################ +# Run command +################################################################################ + +if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ + else + eval $@ + fi +else + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then + echo "[BINDINGS]" + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu + hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 + else + hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + fi + else + echo "Unable to show bindings, hwloc not available." + fi +fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind new file mode 100755 index 0000000000..b6fe07a1bd --- /dev/null +++ b/lib/kokkos/bin/kokkos-bind @@ -0,0 +1,221 @@ +#!/usr/bin/env bash + +# check if hwloc commands exist +declare -i HAS_HWLOC=0 +type hwloc-bind >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-distrib >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-ls >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-calc >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" + +type hwloc-ps >/dev/null 2>&1 +HAS_HWLOC="${HAS_HWLOC} + $?" 
+
+
+#parse args
+declare -a UNKNOWN_ARGS=()
+declare -i DISTRIBUTE=1
+declare -i INDEX=0
+PROC_BIND="all"
+CURRENT_CPUSET=""
+OPENMP_VERSION=4.0
+OPENMP_PROC_BIND=True
+OPENMP_NESTED=True
+VERBOSE=False
+
+#get the current process cpuset
+if [[ ${HAS_HWLOC} -eq 0 ]]; then
+  MY_PID="$BASHPID"
+  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
+  echo "$CURRENT_CPUSET"
+fi
+
+function show_help {
+  local cmd=$(basename "$0")
+  echo "Usage: ${cmd} -- command ..."
+  echo "  Uses hwloc to divide the node into the given number of groups,"
+  echo "  set the appropriate OMP_NUM_THREADS and execute the command on the"
+  echo "  selected group."
+  echo ""
+  echo "  NOTE: This command assumes it has exclusive use of the node"
+  echo ""
+  echo "Options:"
+  echo "  --proc-bind=<LOC>       Set the initial process mask for the script."
+  echo "                          LOC can be any valid location argument for"
+  echo "                          hwloc-calc. Defaults to the entire machine"
+  echo "  --distribute=N          Distribute the current proc-bind into N groups"
+  echo "  --index=I               Use the i'th group (zero based)"
+  echo "  --openmp=M.m            Set env variables for the given OpenMP version"
+  echo "                          (default 4.0)"
+  echo "  --no-openmp-proc-bind   Set OMP_PROC_BIND to false and unset OMP_PLACES"
+  echo "  --no-openmp-nested      Set OMP_NESTED to false"
+  echo "  -v|--verbose"
+  echo "  -h|--help"
+  echo ""
+  echo "Sample Usage:"
+  echo "  ${cmd} --distribute=4 --index=2 -v -- command ..."
+  echo ""
+}
+
+if [[ "$#" -eq 0 ]]; then
+  show_help
+  exit 0
+fi
+
+
+for i in $@; do
+  case $i in
+    # number of partitions to create
+    --proc-bind=*)
+      PROC_BIND="${i#*=}"
+      shift
+      ;;
+    --distribute=*)
+      DISTRIBUTE="${i#*=}"
+      shift
+      ;;
+    # which group to use
+    --index=*)
+      INDEX="${i#*=}"
+      shift
+      ;;
+    --openmp=*)
+      OPENMP_VERSION="${i#*=}"
+      shift
+      ;;
+    --no-openmp-proc-bind)
+      OPENMP_PROC_BIND=False
+      shift
+      ;;
+    --no-openmp-nested)
+      OPENMP_NESTED=False
+      shift
+      ;;
+    -v|--verbose)
+      VERBOSE=True
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    # ignore remaining arguments
+    --)
+      shift
+      break
+      ;;
+    # unknown option
+    *)
+      UNKNOWN_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
+
+if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
+  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
+  exit 1
+fi
+
+if [[ ${DISTRIBUTE} -le 0 ]]; then
+  echo "Invalid input for distribute, changing distribute to 1"
+  DISTRIBUTE=1
+fi
+
+if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
+  echo "Invalid input for index, changing index to 0"
+  INDEX=0
+fi
+
+if [[ ${HAS_HWLOC} -ne 0 ]]; then
+  echo "hwloc not found, no process binding will occur"
+  DISTRIBUTE=1
+  INDEX=0
+fi
+
+if [[ ${HAS_HWLOC} -eq 0 ]]; then
+
+  if [[ "${CURRENT_CPUSET}" == "" ]]; then
+    BINDING=$(hwloc-calc ${PROC_BIND})
+  else
+    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
+  fi
+
+  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
+  CPUSET=${CPUSETS[${INDEX}]}
+  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
+
+  if [[ "${VERBOSE}" == "True" ]]; then
+    echo "hwloc:           true"
+    echo "  proc_bind:     ${PROC_BIND}"
+    echo "  distribute:    ${DISTRIBUTE}"
+    echo "  index:         ${INDEX}"
+    echo "  parent_cpuset: ${CURRENT_CPUSET}"
+    echo "  cpuset:        ${CPUSET}"
+    echo "omp_num_threads: ${NUM_THREADS}"
+    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
+    echo "omp_nested:      ${OPENMP_NESTED}"
+    echo "OpenMP:          ${OPENMP_VERSION}"
+  fi
+
+  # set OMP env
+  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
+    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
+      export OMP_PLACES="threads"
+      export OMP_PROC_BIND="spread"
+    else
+ export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + unset OMP_PLACES + unset OMP_PROC_BIND + fi + if [[ "${OPENMP_NESTED}" == "True" ]]; then + export OMP_NESTED="true" + else + export OMP_NESTED="false" + fi + export OMP_NUM_THREADS="${NUM_THREADS}" + + hwloc-bind ${CPUSET} -- $@ +else + NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) + + if [[ "${VERBOSE}" == "True" ]]; then + echo "hwloc: false" + echo "omp_num_threads: ${NUM_THREADS}" + echo "omp_proc_bind: ${OPENMP_PROC_BIND}" + echo "omp_nested: ${OPENMP_NESTED}" + echo "OpenMP: ${OPENMP_VERSION}" + fi + + # set OMP env + if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then + if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then + export OMP_PLACES="threads" + export OMP_PROC_BIND="spread" + else + export OMP_PROC_BIND="true" + unset OMP_PLACES + fi + else + unset OMP_PLACES + unset OMP_PROC_BIND + fi + if [[ "${OPENMP_NESTED}" == "True" ]]; then + export OMP_NESTED="true" + else + export OMP_NESTED="false" + fi + export OMP_NUM_THREADS="${NUM_THREADS}" + + eval $@ +fi + diff --git a/lib/kokkos/bin/runtest b/lib/kokkos/bin/runtest new file mode 100755 index 0000000000..92411fe5ba --- /dev/null +++ b/lib/kokkos/bin/runtest @@ -0,0 +1,165 @@ +#!/usr/bin/env bash + +function get_path() { + cd "$(dirname "$0")" + cd .. + echo "$(pwd -P)" +} + +KOKKOS_PATH="$(get_path "$0")" + +function show_help() { + local cmd=$(basename "$0") + echo "Usage: ${cmd} " + echo " Build and run the tests" + echo "" + echo "Options:" + echo " -j=N|--make-j=N Build the tests in parallel" + echo " -c|--clean Clean build and regenerate make files" + echo " --clean-on-pass Clean build when runtest passes" + echo " --output-prefix=
  Prefix of log files  Default: runtest"
+  echo "  --build-only           Only build the tests"
+  echo "  -v|--verbose           Tee STDOUT and STDERR to screen and files"
+  echo "  -h|--help              Show this message"
+  echo ""
+  ${KOKKOS_PATH}/generate_makefile.bash --help
+  return 0
+}
+
+
+declare -a GENERATE_ARGS=()
+declare -i VERBOSE=0
+declare -i CLEAN=0
+declare -i CLEAN_ON_PASS=0
+declare -i BUILD_ONLY=0
+OUTPUT="runtest"
+
+declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
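+# Default build parallelism: when launched under hpcbind this reuses the
+# processing-unit count it exports (HPCBIND_NUM_PUS); otherwise fall back to 1.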
+
+for i in $@; do
+  case $i in
+    -j=*|--make-j=*)
+      MAKE_J=${i#*=}
+      shift
+      ;;
+    -c|--clean)
+      CLEAN=1
+      shift
+      ;;
+    --clean-on-pass)
+      CLEAN_ON_PASS=1
+      shift
+      ;;
+    --output-prefix=*)
+      OUTPUT=${i#*=}
+      shift
+      ;;
+    --build-only)
+      BUILD_ONLY=1
+      shift
+      ;;
+    -v|--verbose)
+      VERBOSE=1
+      shift
+      ;;
+    -h|--help)
+      show_help
+      exit 0
+      ;;
+    *)
+      GENERATE_ARGS+=("$i")
+      shift
+      ;;
+  esac
+done
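+# Any argument not matched above was collected into GENERATE_ARGS; it is
+# forwarded verbatim to generate_makefile.bash below (the ${i#*=} expansions
+# strip everything up to and including the first '=').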
+
+if [[ "$(pwd -P)" == ${KOKKOS_PATH} ]]; then
+  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
+  exit 1
+fi
+
+# Some makefile dependencies are incorrect, so clean needs to force
+# a new call to generate_makefile.bash
+if [[ ${CLEAN} -eq 1 ]]; then
+  START=${SECONDS}
+  echo "Cleaning"
+  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
+  END=${SECONDS}
+  echo "    $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+fi
+
+declare -i START=${SECONDS}
+echo "Generating Makefile"
+echo "    ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
+
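+# Non-verbose runs keep stdout in ${OUTPUT}.out; stderr is duplicated into
+# ${OUTPUT}.err via process substitution so failures still reach the terminal.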
+if [[ ${VERBOSE} -eq 0 ]]; then
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
+else
+  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
+fi
+declare -i RESULT=$?
+declare -i END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "    PASS:  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  grep "FAIL" ${OUTPUT}.out
+  grep "FAIL" ${OUTPUT}.err
+  echo "    FAIL:  $((END-START)) seconds"
+  exit 1
+fi
+
+START=${SECONDS}
+echo "Building"
+if [[ ${VERBOSE} -eq 0 ]]; then
+  make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+else
+  make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+fi
+RESULT=$?
+END=${SECONDS}
+if [[ ${RESULT} -eq 0 ]]; then
+  echo "    PASS:  $((END-START)) seconds"
+  if [[ ${VERBOSE} -eq 1 ]]; then
+    echo ""
+    echo ""
+  fi
+else
+  grep -E "[[:space:]]error:[[:space:]]" ${OUTPUT}.out
+  grep -E "[[:space:]]error:[[:space:]]" ${OUTPUT}.err
+  echo "    FAIL:  $((END-START)) seconds"
+  exit 1
+fi
+
+if [[ ${BUILD_ONLY} -eq 0 ]]; then
+  START=${SECONDS}
+  echo "Testing"
+  if [[ ${VERBOSE} -eq 0 ]]; then
+    make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
+  else
+    make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
+  fi
+  RESULT=$?
+  END=${SECONDS}
+  if [[ ${RESULT} -eq 0 ]]; then
+    echo "    PASS:  $((END-START)) seconds"
+    if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
+      make clean
+    fi
+  else
+    grep "FAIL" ${OUTPUT}.out
+    grep "FAIL" ${OUTPUT}.err
+    echo "    FAIL:  $((END-START)) seconds"
+    exit 1
+  fi
+fi
+
+exit ${RESULT}
+
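A minimal usage sketch for the `runtest` helper added above. It must be invoked from a scratch directory (it refuses to run from the repository root); the directory layout and the forwarded `--with-devices=OpenMP` option are assumptions based on the `generate_makefile.bash` flags visible elsewhere in this patch, not part of the script itself:

```bash
# Hypothetical out-of-source invocation of lib/kokkos/bin/runtest.
mkdir -p build && cd build
../lib/kokkos/bin/runtest --make-j=8 --output-prefix=omp --with-devices=OpenMP
# Configure, build, and test logs accumulate in omp.out / omp.err;
# a non-zero exit status means one of the three phases failed.
```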
diff --git a/lib/kokkos/cmake/kokkos.cmake b/lib/kokkos/cmake/kokkos.cmake
index 235b7eaba4..396822c7fa 100644
--- a/lib/kokkos/cmake/kokkos.cmake
+++ b/lib/kokkos/cmake/kokkos.cmake
@@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
     ${Kokkos_SOURCE_DIR}/containers/src
     ${Kokkos_SOURCE_DIR}/algorithms/src
     ${Kokkos_BINARY_DIR}  # to find KokkosCore_config.h
+    ${KOKKOS_INCLUDE_DIRS}
 )
 
+# pass include dirs back to parent scope
+SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
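+# (a parent CMakeLists.txt that pulls this in via ADD_SUBDIRECTORY can read
+#  the include paths back through Kokkos_INCLUDE_DIRS_RET)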
+
 INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
 
 IF(KOKKOS_SEPARATE_LIBS)
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index cc6f4c97d7..0447db4b2b 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -7,3 +7,4 @@ tag:  2.02.07    date: 12:16:2016    master: 4b4cc4ba    develop: 382c0966
 tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6
 tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641
 tag:  2.03.05    date: 05:27:2017    master: 36b92f43    develop: 79073186
+tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
diff --git a/lib/kokkos/config/query_cuda_arch.cpp b/lib/kokkos/config/query_cuda_arch.cpp
new file mode 100644
index 0000000000..383f04e34e
--- /dev/null
+++ b/lib/kokkos/config/query_cuda_arch.cpp
@@ -0,0 +1,24 @@
+#include <cuda_runtime_api.h>
+#include <cstdio>
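+
+// Probe CUDA device 0 and print its architecture family and compute capability.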
+int main()
+{
+  cudaDeviceProp prop;
+  const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
+  if (cudaSuccess != err_code) {
+    fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
+    return -1;
+  }
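+  // Map the major compute capability to a Kokkos architecture family name.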
+  switch (prop.major) {
+    case 3:
+      printf("Kepler"); break;
+    case 5:
+      printf("Maxwell"); break;
+    case 6:
+      printf("Pascal"); break;
+    default:
+      fprintf(stderr, "Unsupported Device %d%d\n", (int)prop.major, (int)prop.minor);
+      return -1;
+  }
+  printf("%d%d\n", (int)prop.major, (int)prop.minor);
+  return 0;
+}
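The probe prints an architecture family name followed by the raw compute capability on stdout (e.g. `Kepler35` for a compute-capability 3.5 part), which matches the `KOKKOS_ARCH` spellings used elsewhere in this patch. A minimal sketch of how a build script might consume it; the nvcc invocation and make variables shown here are assumptions, not something this patch wires up:

```bash
# Hypothetical: build and run the probe, then feed its output to a Kokkos build.
nvcc -o query_cuda_arch query_cuda_arch.cpp
if ARCH=$(./query_cuda_arch); then
  make KOKKOS_DEVICES=Cuda KOKKOS_ARCH="${ARCH}"  # e.g. KOKKOS_ARCH=Kepler35
else
  echo "no supported CUDA device found" >&2
fi
```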
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
index 8e1246bf8b..005cd20721 100755
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
@@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
                "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
                "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
                "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
-               "clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
+               "clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
                "cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   else
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
-               "clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
+               "clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
                "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
                "gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@@ -584,7 +589,7 @@ single_build_and_test() {
   else
     run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
     local -i build_start_time=$(date +%s)
-    run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
     local -i build_end_time=$(date +%s)
     comment="build_time=$(($build_end_time-$build_start_time))"
 
diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel
index 23968e8c0f..6527df2eb9 100755
--- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel
+++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel
@@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
 export JENKINS_DO_SERIAL=OFF
 export JENKINS_DO_COMPLEX=OFF
 
-export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
-export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
 export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
 export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
 
 export JENKINS_DO_TESTS=ON
 export JENKINS_DO_EXAMPLES=ON
-export JENKINS_DO_SHARED=OFF
+export JENKINS_DO_SHARED=ON
 
 export QUEUE=haswell
 
diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel
index 964de3a002..1a306bc2b2 100755
--- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel
+++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel
@@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
 export JENKINS_DO_SERIAL=ON
 export JENKINS_DO_COMPLEX=ON
 
-export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
-export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
+export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
 export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
 export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
 
 export JENKINS_DO_TESTS=ON
 export JENKINS_DO_EXAMPLES=ON
-export JENKINS_DO_SHARED=OFF
+export JENKINS_DO_SHARED=ON
 
 export QUEUE=haswell
 
diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile
index edaaf1ee51..ec69363a17 100644
--- a/lib/kokkos/containers/performance_tests/Makefile
+++ b/lib/kokkos/containers/performance_tests/Makefile
@@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
 test-openmp: KokkosContainers_PerformanceTest_OpenMP
 	./KokkosContainers_PerformanceTest_OpenMP
 
-
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)
diff --git a/lib/kokkos/containers/performance_tests/TestMain.cpp b/lib/kokkos/containers/performance_tests/TestMain.cpp
index f952ab3db5..1224af7cdb 100644
--- a/lib/kokkos/containers/performance_tests/TestMain.cpp
+++ b/lib/kokkos/containers/performance_tests/TestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,15 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
+#include 
+
+#include 
 
 int main(int argc, char *argv[]) {
   ::testing::InitGoogleTest(&argc,argv);
diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
index b674ec4a74..6631184624 100644
--- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
+++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
@@ -69,30 +69,13 @@ protected:
   {
     std::cout << std::setprecision(5) << std::scientific;
 
-    unsigned num_threads = 4;
-
-    if (Kokkos::hwloc::available()) {
-      num_threads = Kokkos::hwloc::get_available_numa_count()
-                    * Kokkos::hwloc::get_available_cores_per_numa()
-                    * Kokkos::hwloc::get_available_threads_per_core()
-                    ;
-
-    }
-
-    std::cout << "OpenMP: " << num_threads << std::endl;
-
-    Kokkos::OpenMP::initialize( num_threads );
-
-    std::cout << "available threads: " << omp_get_max_threads() << std::endl;
+    Kokkos::OpenMP::initialize();
+    Kokkos::OpenMP::print_configuration( std::cout );
   }
 
   static void TearDownTestCase()
   {
     Kokkos::OpenMP::finalize();
-
-    omp_set_num_threads(1);
-
-    ASSERT_EQ( 1 , omp_get_max_threads() );
   }
 };
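
A note on the OpenMP test-harness hunks above (the containers unit test below gets the same treatment): the hand-rolled hwloc thread count is gone and initialization is left to the backend. A minimal sketch of the resulting pattern, assuming the Kokkos 2.x OpenMP API used in this tree:

    #include <Kokkos_Core.hpp>
    #include <iostream>

    int main() {
      // No explicit thread count: the backend decides (OMP_NUM_THREADS,
      // its own hwloc detection, or its default policy).
      Kokkos::OpenMP::initialize();
      Kokkos::OpenMP::print_configuration( std::cout );
      Kokkos::OpenMP::finalize();
      return 0;
    }
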
 
diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp
index 937eab0d88..35cc8ec753 100644
--- a/lib/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp
@@ -564,7 +564,7 @@ namespace Impl {
 template< class D, class A1, class A2, class A3, class ... Args >
 struct DualViewSubview {
 
-  typedef typename Kokkos::Experimental::Impl::ViewMapping
+  typedef typename Kokkos::Impl::ViewMapping
     < void
     , Kokkos::ViewTraits< D, A1, A2, A3 >
     , Args ...
diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
index 8e464506f9..d22d6b865d 100644
--- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -46,19 +46,6 @@
 ///
 /// This header file declares and defines Kokkos::Experimental::DynRankView and its
 /// related nonmember functions.
-/*
- *   Changes from View
- *   1. The rank of the DynRankView is returned by the method rank()
- *   2. Max rank of a DynRankView is 7
- *   3. subview name is subdynrankview
- *   4. Every subdynrankview is returned with LayoutStride
- *
- *   NEW: Redesigned DynRankView
- *   5. subview function name now available
- *   6. Copy and Copy-Assign View to DynRankView
- *   7. deep_copy between Views and DynRankViews
- *   8. rank( view ); returns the rank of View or DynRankView
- */
 
 #ifndef KOKKOS_DYNRANKVIEW_HPP
 #define KOKKOS_DYNRANKVIEW_HPP
@@ -117,6 +104,14 @@ struct DynRankDimTraits {
                       , layout.dimension[7] );
   }
 
+  // Extra overload to match that for specialize types v2
+  template <typename Layout, typename ... P>
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
+  {
+    return computeRank(layout);
+  }
+
   // Create the layout for the rank-7 view.
   // Non-strided Layout
   template <typename Layout>
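
The "extra overload" added above is a forwarding shim: the unspecialized path ignores the ViewCtorProp argument and defers to the layout-only computeRank, so specialized value types can hook in with a prop-aware overload of their own. A self-contained sketch of the idiom with stand-in types (not the Kokkos definitions):

    #include <cstddef>

    struct Layout { std::size_t dimension[8]; };       // stand-in layout
    template <class... P> struct CtorProp {};          // stand-in for ViewCtorProp

    inline std::size_t computeRank( const Layout& layout ) {
      return layout.dimension[0] != 0 ? 1 : 0;         // placeholder rank rule
    }

    // The prop-taking overload drops the prop and forwards.
    template <class... P>
    std::size_t computeRank( const CtorProp<P...>&, const Layout& layout ) {
      return computeRank( layout );
    }
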
@@ -158,8 +153,17 @@ struct DynRankDimTraits {
                  );
   }
 
+  // Extra overload to match that for specialize types
+  template <typename Traits, typename ... P>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
+  {
+    return createLayout( layout );
+  }
+
   // Create a view from the given dimension arguments.
   // This is only necessary because the shmem constructor doesn't take a layout.
+  //   NDE shmem Views are not compatible with the added view_alloc value_type / fad_dim deduction functionality
+  template <typename ViewType, typename ViewArg>
   static ViewType createView( const ViewArg& arg
                             , const size_t N0
@@ -186,7 +190,8 @@ struct DynRankDimTraits {
   // Non-strided Layout
   template <typename Layout , typename iType>
   KOKKOS_INLINE_FUNCTION
-  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
   {
     return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
                  , dynrank > 1 ? layout.dimension[1] : ~size_t(0)
@@ -202,7 +207,8 @@ struct DynRankDimTraits {
   // LayoutStride
   template <typename Layout , typename iType>
   KOKKOS_INLINE_FUNCTION
-  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
   {
     return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
                  , dynrank > 0 ? layout.stride[0] : (0)
@@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
 /** \brief  Assign compatible default mappings */
 struct ViewToDynRankViewTag {};
 
+} // namespace Impl
+} // namespace Experimental
+
+namespace Impl {
+
 template< class DstTraits , class SrcTraits >
 class ViewMapping< DstTraits , SrcTraits ,
   typename std::enable_if<(
@@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
         )
       )
     )
-  ) , ViewToDynRankViewTag >::type >
+  ) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
 {
 private:
 
@@ -376,7 +387,7 @@ public:
 
       typedef typename DstType::offset_type  dst_offset_type ;
       dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
-      dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
+      dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
       dst.m_track.assign( src.m_track , DstTraits::is_managed );
       dst.m_rank = src.Rank ;
     }
@@ -384,22 +395,20 @@ public:
 
 } //end Impl
 
+namespace Experimental {
+
 /* \class DynRankView
  * \brief Container that creates a Kokkos view with rank determined at runtime.
- *   Essentially this is a rank 7 view that wraps the access operators
- *   to yield the functionality of a view
+ *   Essentially this is a rank 7 view
  *
  *   Changes from View
  *   1. The rank of the DynRankView is returned by the method rank()
  *   2. Max rank of a DynRankView is 7
- *   3. subview name is subdynrankview
- *   4. Every subdynrankview is returned with LayoutStride
- *
- *   NEW: Redesigned DynRankView
- *   5. subview function name now available
- *   6. Copy and Copy-Assign View to DynRankView
- *   7. deep_copy between Views and DynRankViews
- *   8. rank( view ); returns the rank of View or DynRankView
+ *   3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility) 
+ *   4. Every subview is returned with LayoutStride
+ *   5. Copy and Copy-Assign View to DynRankView
+ *   6. deep_copy between Views and DynRankViews
+ *   7. rank( view ); returns the rank of View or DynRankView
  *
  */
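
A minimal usage sketch of the behaviors enumerated in the comment above (runtime rank, copy construction from a View), with spellings taken from this header:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_DynRankView.hpp>

    void example() {
      Kokkos::View<double**, Kokkos::HostSpace> v( "v", 4, 5 );
      // Copy-construct from a compile-time rank-2 View;
      // d.rank() reports 2 at runtime.
      Kokkos::Experimental::DynRankView<double, Kokkos::HostSpace> d( v );
      (void) d.rank();
    }
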
 
@@ -427,7 +436,7 @@ public:
 
 
 private:
-  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
   typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
 
   track_type  m_track ;
@@ -556,7 +565,7 @@ public:
   // Allow specializations to query their specialized map
 
   KOKKOS_INLINE_FUNCTION
-  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
+  const Kokkos::Impl::ViewMapping< traits , void > &
   implementation_map() const { return m_map ; }
 
   //----------------------------------------
@@ -803,7 +812,7 @@ public:
     , m_rank(rhs.m_rank)
     {
       typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
       Mapping::assign( m_map , rhs.m_map , rhs.m_track );
     }
@@ -813,7 +822,7 @@ public:
   DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
     {
       typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
       Mapping::assign( m_map , rhs.m_map , rhs.m_track );
       m_track.assign( rhs.m_track , traits::is_managed );
@@ -831,7 +840,7 @@ public:
     , m_rank( rhs.Rank )
     {
       typedef typename View<RT,RP...>::traits  SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
       Mapping::assign( *this , rhs );
     }
@@ -841,7 +850,7 @@ public:
   DynRankView & operator = ( const View<RT,RP...> & rhs )
     {
       typedef typename View<RT,RP...>::traits  SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
       static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
       Mapping::assign( *this , rhs );
       return *this ;
@@ -870,7 +879,7 @@ public:
       )
       : m_track()
       , m_map()
-      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
     {
       // Append layout and spaces if not input
       typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
@@ -923,7 +932,7 @@ public:
 //------------------------------------------------------------
 
       Kokkos::Experimental::Impl::SharedAllocationRecord<> *
-        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
+        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
 
 //------------------------------------------------------------
 #if defined( KOKKOS_ENABLE_CUDA )
@@ -947,8 +956,8 @@ public:
                                >::type const & arg_layout
       )
       : m_track() // No memory tracking
-      , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
-      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
+      , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
     {
       static_assert(
         std::is_same< pointer_type
@@ -1034,6 +1043,7 @@ public:
     {}
 
   // For backward compatibility
+  // NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
   explicit inline
   DynRankView( const ViewAllocateWithoutInitializing & arg_prop
       , const typename traits::array_layout & arg_layout
@@ -1179,6 +1189,11 @@ namespace Impl {
 
 struct DynRankSubviewTag {};
 
+} // namespace Impl
+} // namespace Experimental
+
+namespace Impl {
+
 template< class SrcTraits , class ... Args >
 struct ViewMapping
   < typename std::enable_if<(
@@ -1192,7 +1207,7 @@ struct ViewMapping
         std::is_same< typename SrcTraits::array_layout
                     , Kokkos::LayoutStride >::value
       )
-    ), DynRankSubviewTag >::type
+    ), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
   , SrcTraits
   , Args ... >
 {
@@ -1264,7 +1279,7 @@ public:
   };
 
 
-  typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits >  ret_type;
+  typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits >  ret_type;
 
   template < typename T , class ... P >
   KOKKOS_INLINE_FUNCTION
@@ -1336,9 +1351,10 @@ public:
 
 } // end Impl
 
+namespace Experimental {
 
 template< class V , class ... Args >
-using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
+using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
 
 template< class D , class ... P , class ...Args >
 KOKKOS_INLINE_FUNCTION
@@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
     if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
       { Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
 
-    typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
+    typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
 
     return metafcn::subview( src.rank() , src , args... );
   }
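
The recurring edit throughout this header moves the ViewMapping machinery from Kokkos::Experimental::Impl into Kokkos::Impl while DynRankView and subdynrankview stay in Kokkos::Experimental. A usage sketch of the subview behavior listed in the class comment, with the API spelling assumed from the code above:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_DynRankView.hpp>

    void example() {
      Kokkos::Experimental::DynRankView<double, Kokkos::HostSpace> d( "d", 10, 10 );
      // Fixing one index yields a rank-1 DynRankView with LayoutStride.
      auto s = Kokkos::Experimental::subdynrankview( d, Kokkos::ALL(), 3 );
      (void) s;
    }
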
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index da96db2d6b..e9059d64c4 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -57,7 +57,7 @@ namespace Experimental {
  */
 template< typename DataType , typename ... P >
 class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
-{ 
+{
 public:
 
   typedef Kokkos::ViewTraits< DataType , P ... >  traits ;
@@ -68,7 +68,7 @@ private:
 
   typedef Kokkos::Experimental::Impl::SharedAllocationTracker   track_type ;
 
-  static_assert( traits::rank == 1 && traits::rank_dynamic == 1 
+  static_assert( traits::rank == 1 && traits::rank_dynamic == 1
                , "DynamicView must be rank-one" );
 
   static_assert( std::is_trivial< typename traits::value_type >::value &&
@@ -216,14 +216,14 @@ public:
         // Verify that allocation of the requested chunk is in progress.
 
         // The allocated chunk counter is m_chunks[ m_chunk_max ]
-        const uintptr_t n = 
+        const uintptr_t n =
           *reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
 
         if ( n <= ic ) {
           Kokkos::abort("Kokkos::DynamicView array bounds error");
         }
 
-        // Allocation of this chunk is in progress 
+        // Allocation of this chunk is in progress
         // so wait for allocation to complete.
         while ( 0 == *ch );
       }
@@ -267,7 +267,7 @@ public:
         const uintptr_t jc_try = jc ;
 
         // Jump iteration to the chunk counter.
-        
+
         jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
 
         if ( jc_try == jc ) {
@@ -316,7 +316,7 @@ public:
       }
       else {
         while ( NC + 1 <= *pc ) {
-          --*pc ;        
+          --*pc ;
           m_pool.deallocate( m_chunks[*pc]
                            , sizeof(value_type) << m_chunk_shift );
           m_chunks[*pc] = 0 ;
@@ -331,7 +331,7 @@ public:
     typename traits::value_type ** m_chunks ;
     uintptr_t                    * m_pc ;
     uintptr_t                      m_nc ;
-    unsigned                       m_chunk_shift ;  
+    unsigned                       m_chunk_shift ;
 
     KOKKOS_INLINE_FUNCTION
     void operator()( int ) const
@@ -348,7 +348,7 @@ public:
         }
         else {
           while ( m_nc + 1 <= *m_pc ) {
-            --*m_pc ;        
+            --*m_pc ;
             m_pool.deallocate( m_chunks[*m_pc]
                              , sizeof(value_type) << m_chunk_shift );
             m_chunks[*m_pc] = 0 ;
@@ -482,7 +482,7 @@ public:
   };
 
 
-  /**\brief  Allocation constructor 
+  /**\brief  Allocation constructor
    *
    *  Memory is allocated in chunks from the memory pool.
    *  The chunk size conforms to the memory pool's chunk size.
@@ -557,7 +557,7 @@ void deep_copy( const View & dst
 
   if ( DstExecCanAccessSrc ) {
     // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
-    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
   }
   else {
     Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
@@ -581,7 +581,7 @@ void deep_copy( const DynamicView & dst
 
   if ( DstExecCanAccessSrc ) {
     // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
-    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
   }
   else {
     Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp
index 5a78a5de9e..651a4e7eb8 100644
--- a/lib/kokkos/containers/unit_tests/TestCuda.cpp
+++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp
@@ -69,6 +69,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 //----------------------------------------------------------------------------
 
 
@@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
   TestDynViewAPI< double , Kokkos::Cuda >();
 }
 
+TEST_F( cuda, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
+}
+
 TEST_F( cuda , staticcrsgraph )
 {
   TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
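
The same viewctorprop_embedded_dim body is registered once per backend via that backend's gtest fixture; the analogous OpenMP, Serial, and Threads hunks follow below. A stripped-down sketch of the pattern, all names illustrative:

    #include <gtest/gtest.h>

    struct FakeExecSpace {};                            // stand-in for a backend

    template <class ExecSpace>
    struct SomeBackendTest {
      static void run( int n0, int n1 ) { ASSERT_GT( n0 * n1, 0 ); }
    };

    class cuda_fixture : public ::testing::Test {};     // stand-in fixture

    TEST_F( cuda_fixture, viewctorprop_embedded_dim ) {
      SomeBackendTest<FakeExecSpace>::run( 2, 3 );
    }
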
diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
index 2448bd077b..5365d91361 100644
--- a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
+++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
@@ -66,6 +66,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 #include 
 
 namespace Test {
@@ -76,14 +78,7 @@ protected:
   {
     std::cout << std::setprecision(5) << std::scientific;
 
-    unsigned threads_count = 4 ;
-
-    if ( Kokkos::hwloc::available() ) {
-      threads_count = Kokkos::hwloc::get_available_numa_count() *
-                      Kokkos::hwloc::get_available_cores_per_numa();
-    }
-
-    Kokkos::OpenMP::initialize( threads_count );
+    Kokkos::OpenMP::initialize();
   }
 
   static void TearDownTestCase()
@@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
   TestDynViewAPI< double , Kokkos::OpenMP >();
 }
 
+TEST_F( openmp, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
+}
+
 TEST_F( openmp, bitset )
 {
   test_bitset();
diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp
index 06c4d9f6ed..1b9b5a2da3 100644
--- a/lib/kokkos/containers/unit_tests/TestSerial.cpp
+++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp
@@ -67,6 +67,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 namespace Test {
 
 class serial : public ::testing::Test {
@@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
   TestDynViewAPI< double , Kokkos::Serial >();
 }
 
+TEST_F( serial, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
+}
+
 TEST_F( serial , staticcrsgraph )
 {
   TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp
index 938ec88e90..aca0b57d65 100644
--- a/lib/kokkos/containers/unit_tests/TestThreads.cpp
+++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp
@@ -70,6 +70,8 @@
 #include 
 #include 
 
+#include <TestViewCtorPropEmbeddedDim.hpp>
+
 namespace Test {
 
 class threads : public ::testing::Test {
@@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
   TestDynViewAPI< double , Kokkos::Threads >();
 }
 
+TEST_F( threads, viewctorprop_embedded_dim ) {
+  TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
+}
+
 TEST_F( threads , staticcrsgraph )
 {
   TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
diff --git a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
new file mode 100644
index 0000000000..1efd1ddc51
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+
+#include <type_traits>
+#include <typeinfo>
+
+namespace Test {
+
+namespace {
+
+template <typename ExecSpace>
+struct TestViewCtorProp_EmbeddedDim {
+
+  using ViewIntType     = typename Kokkos::View< int**, ExecSpace >;
+  using ViewDoubleType     = typename Kokkos::View< double*, ExecSpace >;
+
+  using DynRankViewIntType     = typename Kokkos::DynRankView< int, ExecSpace >;
+  using DynRankViewDoubleType     = typename Kokkos::DynRankView< double, ExecSpace >;
+
+  // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
+  template < class ViewType >
+  struct Functor {
+
+    ViewType v;
+
+    Functor( const ViewType & v_ ) : v(v_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const int i ) const {
+      v(i) = i;
+    }
+
+  };
+
+
+  static void test_vcpt( const int N0, const int N1 )
+  {
+
+    // Create two views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // TEST: Test for common type between two views, one with type double, other with type int
+      // Deduce common value_type and construct a view with that type
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
+      #if 0
+      // debug output
+      for ( int i = 0; i < N0*N1; ++i ) {
+        printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
+      }
+
+      printf( " Common value type view: %s \n", typeid( CVT() ).name() );
+      printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
+      if ( std::is_same< CommonViewValueType, double >::value == true ) {
+        printf("Proper common value_type\n");
+      }
+      else {
+        printf("WRONG common value_type\n");
+      }
+      // end debug output
+      #endif
+      }
+
+      {
+        // Single view
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
+      }
+
+    }
+
+    // Create two dynamic rank views to test
+    {
+      using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
+      using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
+
+      VIT vi1("vi1", N0, N1);
+      VDT vd1("vd1", N0);
+
+      // TEST: Test for common type between two views, one with type double, other with type int
+      // Deduce common value_type and construct a view with that type
+      {
+        // Two views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
+      }
+
+      {
+        // Single views
+        auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
+        typedef typename decltype( view_alloc_arg )::value_type                    CommonViewValueType;
+        typedef typename Kokkos::View< CommonViewValueType*, ExecSpace >  CVT;
+        typedef typename CVT::HostMirror                                           HostCVT;
+
+        // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
+        CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
+
+        Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
+          Functor<CVT>(cv1)
+        );
+
+        HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
+        Kokkos::deep_copy( hcv1, cv1 );
+
+        ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
+      }
+    }
+
+
+  } // end test_vcpt
+
+}; // end struct
+
+} // namespace
+
+} // namespace Test
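
Condensed, what the new header exercises: common_view_alloc_prop deduces a common value_type across the given views (double wins over int here), and the resulting prop is accepted by view_alloc. A sketch under the same assumptions as the test:

    #include <Kokkos_Core.hpp>

    void example() {
      Kokkos::View<int*>    vi( "vi", 6 );
      Kokkos::View<double*> vd( "vd", 6 );

      auto prop = Kokkos::common_view_alloc_prop( vi, vd );
      using common_t = typename decltype( prop )::value_type;   // double
      Kokkos::View<common_t*> cv( Kokkos::view_alloc( "cv", prop ), 6 );
      (void) cv;
    }
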
diff --git a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
index f952ab3db5..2b73535c83 100644
--- a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
+++ b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
+#include 
+#include 
 
 int main(int argc, char *argv[]) {
   ::testing::InitGoogleTest(&argc,argv);
diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile
index f59e7bbe1c..bb9353f583 100644
--- a/lib/kokkos/core/perf_test/Makefile
+++ b/lib/kokkos/core/perf_test/Makefile
@@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool
 test-taskdag: KokkosCore_PerformanceTest_TaskDAG
 	./KokkosCore_PerformanceTest_TaskDAG
 
-
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)
diff --git a/lib/kokkos/core/perf_test/PerfTestMain.cpp b/lib/kokkos/core/perf_test/PerfTestMain.cpp
index d80cfab8b5..832f650b9a 100644
--- a/lib/kokkos/core/perf_test/PerfTestMain.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
+#include 
+
 #include 
 
 namespace Test {
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
new file mode 100644
index 0000000000..46321378d9
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@@ -0,0 +1,2715 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
+#define KOKKOS_CUDA_EXP_ITERATE_TILE_REFACTOR_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+
+#include 
+#include 
+#include 
+
+#include 
+
+// #include <Cuda/Kokkos_CudaExec.hpp>
+// Including the file above leads to the following type of errors:
+// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
+// use existing Kokkos functionality, e.g. max blocks, once resolved
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+namespace Kokkos { namespace Experimental { namespace Impl {
+
+namespace Refactor {
+
+// ------------------------------------------------------------------ //
+// ParallelFor iteration pattern
+template< int N , typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile;
+
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      // Loop over size maxnumblocks until full range covered
+      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(Tag(), offset_0 , offset_1);
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
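
These DeviceIterateTile specializations are the device-side workers behind tiled multidimensional parallel_for. A sketch of the host-side call they serve, assuming the Experimental MDRangePolicy spelling of this Kokkos version:

    #include <Kokkos_Core.hpp>

    void example( const int N0, const int N1 ) {
      using policy_t =
        Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >;
      // Bounds {0,0}..{N0,N1}, iterated in 8x8 tiles; each (i,j) visited once.
      policy_t policy( {{0,0}}, {{N0,N1}}, {{8,8}} );
      Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const int i, const int j ) {
        (void) i; (void) j;                              // user body goes here
      } );
    }
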
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
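+  // 65535 is the CUDA upper limit on gridDim.y and gridDim.z; the grid-splitting
+  // logic below uses it to cap the number of blocks assigned per grid dimension.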
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
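+      // Split blockIdx.x across the two fastest tile dimensions; each factor is
+      // clamped to max_blocks, and the product numbl0*numbl1 never exceeds it.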
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct DeviceIterateTile<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct DeviceIterateTile<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    // LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x % numbl0;
+      const index_type tile_id1 = (index_type)blockIdx.x / numbl0;
+      const index_type thr_id0 = (index_type)threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y % numbl2;
+      const index_type tile_id3 = (index_type)blockIdx.y / numbl2;
+      const index_type thr_id2 = (index_type)threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+          (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z % numbl4;
+      const index_type tile_id5 = (index_type)blockIdx.z / numbl4;
+      const index_type thr_id4 = (index_type)threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    // LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+          ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = (index_type)blockIdx.x / numbl1;
+      const index_type tile_id1 = (index_type)blockIdx.x % numbl1;
+      const index_type thr_id0 = (index_type)threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = (index_type)blockIdx.y / numbl3;
+      const index_type tile_id3 = (index_type)blockIdx.y % numbl3;
+      const index_type thr_id2 = (index_type)threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+          (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = (index_type)blockIdx.z / numbl5;
+      const index_type tile_id5 = (index_type)blockIdx.z % numbl5;
+      const index_type thr_id4 = (index_type)threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+} // Refactor
+
+// ----------------------------------------------------------------------------------
+
+namespace Reduce {
+
+template < typename T >
+using is_void = std::is_same< T, void >;
+
+template < typename T >
+struct is_array_type : std::false_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T* > : std::true_type
+{
+  using value_type = T;
+};
+
+template < typename T >
+struct is_array_type< T[] > : std::true_type
+{
+  using value_type = T;
+};
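+
+// For illustration (these asserts are not part of the source, but follow from the
+// definitions above): a pointer or unbounded-array value type is classified as an
+// array reduction, and value_type strips the pointer/extent:
+//   static_assert( !is_array_type< double   >::value , "" );
+//   static_assert(  is_array_type< double*  >::value , "" );
+//   static_assert(  is_array_type< double[] >::value , "" );
+//   static_assert( std::is_same< is_array_type< double* >::value_type , double >::value , "" );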
+
+// ------------------------------------------------------------------ //
+template< int N , typename RP , typename Functor , typename Tag , typename ValueType , typename Enable = void >
+struct DeviceIterateTile;
+
+// ParallelReduce iteration pattern
+// Scalar reductions
+
+// num_blocks = min( num_tiles, max_num_blocks ); //i.e. determined by number of tiles and reduction algorithm constraints
+// extract n-dim tile offsets (i.e. tile's global starting multi-index) from the tile id = block id using tile dimensions
+// local indices within a tile extracted from (index_type)threadIdx.y using tile dims, constrained by blocksize
+// combine tile and local id info for multi-dim global ids
+
+// Pattern:
+// Each block+thread is responsible for a tile+local_id combo (additional tiles when striding by num_blocks)
+// 1. create offset arrays
+// 2. loop over number of tiles, striding by griddim (equal to num tiles, or max num blocks)
+// 3. temps set for tile_idx and thrd_idx, which will be modified while decoding
+// 4. if LL vs LR:
+//      determine tile starting point offsets (multidim)
+//      determine local index offsets (multidim)
+//      concatenate tile offset + local offset for global multi-dim index
+//    if offset within range bounds AND local offset within tile bounds, call functor
+
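+// Worked example of the decode (illustrative numbers, not taken from the source):
+// rank 2, Left layout, lower = {0,0}, upper = {10,10}, tile = {4,4}
+//   => m_tile_end = {3,3}, m_num_tiles = 9, m_prod_tile_dims = 16
+// for blockIdx.x == 7 and threadIdx.y == 5:
+//   i = 0 : m_offset[0] = (7 % 3)*4 + 0 = 4 ; tile_idx = 7/3 = 2
+//           m_local_offset[0] = 5 % 4 = 1 ; thrd_idx = 5/4 = 1 ; m_offset[0] += 1 -> 5
+//   i = 1 : m_offset[1] = (2 % 3)*4 + 0 = 8 ; m_local_offset[1] = 1 % 4 = 1 ; m_offset[1] += 1 -> 9
+// both global offsets are below m_upper and both local offsets fit in the tile,
+// so the functor is called at the global index (5, 9).
+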
+// ValueType = T
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
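+    // Only blocks that map to a tile (blockIdx.x) and threads that map to a point
+    // inside a tile (threadIdx.y) participate; any excess simply falls through.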
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< !is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< !is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , ValueType & v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  ValueType & m_v;
+};
+
+
+// ValueType = T[], T*
+//Rank 2
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,void,ValueType, typename std::enable_if< is_array_type< ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<2,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type< ValueType >::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile-local id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_v ); }
+        }
+      } //end for loop over num_tiles - product of tiles in each direction
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 3
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]); // Move this to first computation, add to m_offset right away
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for void tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<3,RP,Functor,Tag, ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 4
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for void tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<4,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  inline __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 5
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<5,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+//Rank 6
+// Specializations for void tag type
+template< typename RP , typename Functor , typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,void,ValueType , typename std::enable_if< is_array_type<ValueType>::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag, typename ValueType >
+struct DeviceIterateTile<6,RP,Functor,Tag,ValueType, typename std::enable_if< is_array_type<ValueType>::value && !is_void< Tag >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using value_type = typename is_array_type< ValueType >::value_type;
+
+  __device__
+  DeviceIterateTile( const RP & rp_ , const Functor & f_ , value_type* v_)
+  : m_rp(rp_)
+  , m_func(f_)
+  , m_v(v_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+  //static constexpr index_type max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+
+  inline __device__
+  void exec_range() const
+  {
+    //enum { max_blocks = static_cast<index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount) };
+    //const index_type max_blocks = static_cast<index_type>( Kokkos::Impl::cuda_internal_maximum_grid_count() );
+    if ( (index_type)blockIdx.x < m_rp.m_num_tiles && (index_type)threadIdx.y < m_rp.m_prod_tile_dims ) {
+      index_type m_offset[RP::rank]; // tile starting global id offset
+      index_type m_local_offset[RP::rank]; // tile starting global id offset
+
+      for ( index_type tileidx = (index_type)blockIdx.x; tileidx < m_rp.m_num_tiles; tileidx += gridDim.x ) {
+        index_type tile_idx = tileidx; // temp because tile_idx will be modified while determining tile starting point offsets
+        index_type thrd_idx = (index_type)threadIdx.y;
+        bool in_bounds = true;
+
+        // LL
+        if (RP::inner_direction == RP::Left) {
+          for (int i=0; i<RP::rank; ++i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+        // LR
+        else {
+          for (int i=RP::rank-1; i>=0; --i) {
+            m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+            tile_idx /= m_rp.m_tile_end[i];
+
+            // tile-local indices identified with (index_type)threadIdx.y
+            m_local_offset[i] = (thrd_idx % m_rp.m_tile[i]);
+            thrd_idx /= m_rp.m_tile[i];
+
+            m_offset[i] += m_local_offset[i];
+            if ( !(m_offset[i] < m_rp.m_upper[i] && m_local_offset[i] < m_rp.m_tile[i]) ) {
+              in_bounds &= false;
+            }
+          }
+          if ( in_bounds )
+          { m_func( Tag(), m_offset[0], m_offset[1], m_offset[2], m_offset[3], m_offset[4], m_offset[5], m_v ); }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+  value_type* m_v;
+};
+
+} // Reduce
+
+// ----------------------------------------------------------------------------------
+
+} } } //end namespace Kokkos::Experimental::Impl
+
+#endif
+#endif
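
The DeviceIterateTile specializations above all share one piece of index arithmetic: a flat tile id (blockIdx.x) and a flat intra-tile thread id (threadIdx.y) are peeled apart one dimension at a time by mod/div against the per-dimension tile counts and tile extents, and the functor runs only when the resulting global index is in bounds, which clips the partial tiles at the upper edges. The following host-side sketch reproduces that decomposition for the rank-2, inner_direction == Left case; the bounds and tile sizes are hypothetical values chosen for illustration, not taken from the patch.

    // Host-side analogue of the Left-layout tile decomposition used above.
    #include <cstdio>

    int main() {
      const int rank        = 2;
      const int lower[2]    = { 0, 0 };   // m_rp.m_lower
      const int upper[2]    = { 10, 7 };  // m_rp.m_upper
      const int tile[2]     = { 4, 4 };   // m_rp.m_tile
      const int tile_end[2] = { 3, 2 };   // ceil((upper-lower)/tile) per dimension

      const int num_tiles        = tile_end[0] * tile_end[1]; // m_rp.m_num_tiles
      const int threads_per_tile = tile[0] * tile[1];         // m_rp.m_prod_tile_dims

      for (int tileidx = 0; tileidx < num_tiles; ++tileidx) {   // plays blockIdx.x
        for (int thrd = 0; thrd < threads_per_tile; ++thrd) {   // plays threadIdx.y
          int tile_idx = tileidx;
          int thrd_idx = thrd;
          int offset[2];
          bool in_bounds = true;
          for (int i = 0; i < rank; ++i) {  // Left: dimension 0 varies fastest
            offset[i] = (tile_idx % tile_end[i]) * tile[i] + lower[i];
            tile_idx /= tile_end[i];
            offset[i] += thrd_idx % tile[i]; // add the tile-local index
            thrd_idx  /= tile[i];
            if (!(offset[i] < upper[i])) in_bounds = false; // clip partial edge tiles
          }
          if (in_bounds) std::printf("(%d,%d)\n", offset[0], offset[1]);
        }
      }
      return 0;
    }

Every in-bounds index pair is visited exactly once, which is why the kernels above can stride blockIdx.x across m_num_tiles without double-counting work.
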
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index 13abcfd93c..cae8ecd489 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -53,6 +53,7 @@
 #include 
 #include 
 #include 
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
 
 #endif
 
-
-namespace Kokkos {
-namespace Impl {
-  struct CudaLockArraysStruct {
-    int* atomic;
-    int* scratch;
-    int* threadid;
-    int n;
-  };
-}
-}
-__device__ __constant__
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-extern
-#endif
-Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
-
-#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
-#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
-
 namespace Kokkos {
 namespace Impl {
   void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
 }
 }
 
-namespace Kokkos {
-namespace Impl {
-__device__ inline
-bool lock_address_cuda_space(void* ptr) {
-  size_t offset = size_t(ptr);
-  offset = offset >> 2;
-  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
-}
-
-__device__ inline
-void unlock_address_cuda_space(void* ptr) {
-  size_t offset = size_t(ptr);
-  offset = offset >> 2;
-  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
-}
-
-}
-}
-
 template< typename T >
 inline
 __device__
@@ -192,7 +152,7 @@ namespace Impl {
 // For 2.0 capability: 48 KB L1 and 16 KB shared
 //----------------------------------------------------------------------------
 
-template< class DriverType >
+template< class DriverType>
 __global__
 static void cuda_parallel_launch_constant_memory()
 {
@@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
   driver();
 }
 
-template< class DriverType >
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType>
 __global__
 static void cuda_parallel_launch_local_memory( const DriverType driver )
 {
   driver();
 }
 
-template < class DriverType ,
-           bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
+template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
+__global__
+__launch_bounds__(maxTperB, minBperSM)
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+template < class DriverType
+         , class LaunchBounds = Kokkos::LaunchBounds<>
+         , bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
 struct CudaParallelLaunch ;
 
-template < class DriverType >
-struct CudaParallelLaunch< DriverType , true > {
+template < class DriverType, class LaunchBounds >
+struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
 
   inline
   CudaParallelLaunch( const DriverType & driver
@@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
       }
       #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
       else if ( shmem ) {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
       } else {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
       }
       #endif
 
       // Copy functor to constant memory on the device
       cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
 
-      #ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-      Kokkos::Impl::CudaLockArraysStruct locks;
-      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-      locks.n = Kokkos::Cuda::concurrency();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-      #endif
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
       // Invoke the driver function on the device
-      cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
+      cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
 
 #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
       CUDA_SAFE_CALL( cudaGetLastError() );
@@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
   }
 };
 
-template < class DriverType >
-struct CudaParallelLaunch< DriverType , false > {
+template < class DriverType, class LaunchBounds >
+struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
 
   inline
   CudaParallelLaunch( const DriverType & driver
@@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
       }
       #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
       else if ( shmem ) {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
       } else {
-        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
       }
       #endif
 
-      #ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-      Kokkos::Impl::CudaLockArraysStruct locks;
-      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-      locks.n = Kokkos::Cuda::concurrency();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-      #endif
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
 
-      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
+      cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
 
 #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
       CUDA_SAFE_CALL( cudaGetLastError() );
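
A note on the LaunchBounds plumbing above: __launch_bounds__ is a compile-time attribute of the __global__ function itself, so every distinct bound needs its own kernel instantiation, which is why CudaParallelLaunch now carries a LaunchBounds type parameter and forwards maxTperB/minBperSM into the kernel templates. A standalone sketch of the same pattern follows; the driver type and the bound values are hypothetical, not part of the patch.

    // Hypothetical example of forwarding launch bounds to a templated kernel.
    #include <cuda_runtime.h>

    template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
    __global__
    __launch_bounds__( maxTperB, minBperSM )
    void launch_local_memory( const DriverType driver )
    {
      driver();  // per-thread body supplied by the driver object
    }

    struct SaxpyDriver {
      float a; const float* x; float* y; int n;
      __device__ void operator()() const {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if ( i < n ) y[i] = a * x[i] + y[i];
      }
    };

    void run_saxpy( SaxpyDriver d )
    {
      // At most 256 threads per block and at least 4 resident blocks per SM:
      // the compiler caps register usage so that occupancy is achievable.
      launch_local_memory< SaxpyDriver, 256, 4 ><<< (d.n + 255) / 256, 256 >>>( d );
      cudaDeviceSynchronize();
    }
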
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 406b4f1e22..b699f0d6ba 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
   } catch(...) {}
 }
 
-constexpr const char* CudaSpace::name() {
-  return m_name;
-}
-
-constexpr const char* CudaUVMSpace::name() {
-  return m_name;
-}
-
-constexpr const char* CudaHostPinnedSpace::name() {
-  return m_name;
-}
-
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
 SharedAllocationRecord< Kokkos::CudaSpace , void > *
 SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
 {
-  using Header     = SharedAllocationHeader ;
   using RecordBase = SharedAllocationRecord< void , void > ;
   using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
 
 #if 0
+  using Header     = SharedAllocationHeader ;
+
   // Copy the header from the allocation
   Header head ;
 
@@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
   SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
 }
 
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace {
-  __global__ void init_lock_array_kernel_atomic() {
-    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if(i<CUDA_SPACE_ATOMIC_MASK+1)
-      kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
-  }
-
-  __global__ void init_lock_array_kernel_scratch_threadid(int N) {
-    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if(i<N) {
-      kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
-      kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
-    }
-  }
-}
-
-namespace Impl {
-
-int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
-  static int* ptr = NULL;
-  if(deallocate) {
-    cudaFree(ptr);
-    ptr = NULL;
-  }
-
-  if(ptr==NULL && !deallocate)
-    cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
-  return ptr;
-}
-
-int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
-  static int* ptr = NULL;
-  if(deallocate) {
-    cudaFree(ptr);
-    ptr = NULL;
-  }
-
-  if(ptr==NULL && !deallocate)
-    cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
-  return ptr;
-}
-
-int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
-  static int* ptr = NULL;
-  if(deallocate) {
-    cudaFree(ptr);
-    ptr = NULL;
-  }
-
-  if(ptr==NULL && !deallocate)
-    cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
-  return ptr;
-}
-
-void init_lock_arrays_cuda_space() {
-  static int is_initialized = 0;
-  if(! is_initialized) {
-    Kokkos::Impl::CudaLockArraysStruct locks ;
-    locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-    locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-    locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-    locks.n = Kokkos::Cuda::concurrency();
-    cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-    init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
-    init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
-  }
-}
-
 void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
   static void* ptr = NULL;
   static std::int64_t current_size = 0;
@@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
   return ptr;
 }
 
-}
-}
+} // namespace Impl
+} // namespace Kokkos
 #else
 void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
 #endif // KOKKOS_ENABLE_CUDA
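
cuda_resize_scratch_space, kept above and now properly closed inside namespace Kokkos::Impl, maintains a single static allocation that only grows unless force_shrink is set. A host-memory stand-in for that policy, under the assumption that the real implementation reallocates CUDA device memory where this sketch uses malloc/free:

    // Grow-only scratch policy: static pointer plus high-water mark.
    #include <cstdint>
    #include <cstdlib>

    void* resize_scratch( std::int64_t bytes, bool force_shrink = false )
    {
      static void* ptr = nullptr;
      static std::int64_t current_size = 0;
      if ( bytes > current_size || ( bytes < current_size && force_shrink ) ) {
        std::free( ptr );
        ptr = std::malloc( static_cast<std::size_t>( bytes ) );
        current_size = bytes;
      }
      return ptr;
    }
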
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
index daf55cbd97..80e8f9bd8a 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -51,6 +51,7 @@
 
 #include 
 #include 
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include 
 #include 
 
@@ -69,9 +70,6 @@
 __device__ __constant__
 unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
 
-__device__ __constant__
-Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
-
 #endif
 
 /*--------------------------------------------------------------------------*/
@@ -103,6 +101,7 @@ int cuda_kernel_arch()
   return arch ;
 }
 
+#ifdef KOKKOS_ENABLE_CUDA_UVM
 bool cuda_launch_blocking()
 {
   const char * env = getenv("CUDA_LAUNCH_BLOCKING");
@@ -111,16 +110,13 @@ bool cuda_launch_blocking()
 
   return atoi(env);
 }
+#endif
 
 }
 
 void cuda_device_synchronize()
 {
-//  static const bool launch_blocking = cuda_launch_blocking();
-
-//  if (!launch_blocking) {
-    CUDA_SAFE_CALL( cudaDeviceSynchronize() );
-//  }
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
 }
 
 void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
@@ -240,6 +236,7 @@ public:
   unsigned    m_maxWarpCount ;
   unsigned    m_maxBlock ;
   unsigned    m_maxSharedWords ;
+  uint32_t    m_maxConcurrency ;
   size_type   m_scratchSpaceCount ;
   size_type   m_scratchFlagsCount ;
   size_type   m_scratchUnifiedCount ;
@@ -248,6 +245,7 @@ public:
   size_type * m_scratchSpace ;
   size_type * m_scratchFlags ;
   size_type * m_scratchUnified ;
+  uint32_t  * m_scratchConcurrentBitset ;
   cudaStream_t * m_stream ;
 
   static int was_initialized;
@@ -274,6 +272,7 @@ public:
     , m_maxWarpCount( 0 )
     , m_maxBlock( 0 )
     , m_maxSharedWords( 0 )
+    , m_maxConcurrency( 0 )
     , m_scratchSpaceCount( 0 )
     , m_scratchFlagsCount( 0 )
     , m_scratchUnifiedCount( 0 )
@@ -282,6 +281,7 @@ public:
     , m_scratchSpace( 0 )
     , m_scratchFlags( 0 )
     , m_scratchUnified( 0 )
+    , m_scratchConcurrentBitset( 0 )
     , m_stream( 0 )
     {}
 
@@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
   if ( m_stream ||
        m_scratchSpace ||
        m_scratchFlags ||
-       m_scratchUnified ) {
+       m_scratchUnified ||
+       m_scratchConcurrentBitset ) {
     std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
               << std::endl ;
     std::cerr.flush();
@@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
   m_maxWarpCount            = 0 ;
   m_maxBlock                = 0 ;
   m_maxSharedWords          = 0 ;
+  m_maxConcurrency          = 0 ;
   m_scratchSpaceCount       = 0 ;
   m_scratchFlagsCount       = 0 ;
   m_scratchUnifiedCount     = 0 ;
@@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
   m_scratchSpace            = 0 ;
   m_scratchFlags            = 0 ;
   m_scratchUnified          = 0 ;
+  m_scratchConcurrentBitset = 0 ;
   m_stream                  = 0 ;
 }
 
@@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
       (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
     }
     //----------------------------------
+    // Concurrent bitset for obtaining unique tokens from within
+    // an executing kernel.
+    {
+      const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
+
+      m_maxConcurrency =
+        max_threads_per_sm * cudaProp.multiProcessorCount ;
+
+      const int32_t buffer_bound =
+         Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
+
+      // Allocate and initialize uint32_t[ buffer_bound ]
+
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+
+      Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                         , "InternalScratchBitset"
+                                         , sizeof(uint32_t) * buffer_bound );
+
+      Record::increment( r );
+
+      m_scratchConcurrentBitset = reinterpret_cast<uint32_t*>( r->data() );
+
+      CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
+
+    }
+    //----------------------------------
 
     if ( stream_count ) {
       m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
@@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
   cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
 
   // Init the array for used for arbitrarily sized atomics
-  Impl::init_lock_arrays_cuda_space();
-
-  #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-  Kokkos::Impl::CudaLockArraysStruct locks;
-  locks.atomic = atomic_lock_array_cuda_space_ptr(false);
-  locks.scratch = scratch_lock_array_cuda_space_ptr(false);
-  locks.threadid = threadid_lock_array_cuda_space_ptr(false);
-  locks.n = Kokkos::Cuda::concurrency();
-  cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
-  #endif
+  Impl::initialize_host_cuda_lock_arrays();
 }
 
 //----------------------------------------------------------------------------
@@ -635,9 +656,7 @@ void CudaInternal::finalize()
   was_finalized = 1;
   if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
 
-    atomic_lock_array_cuda_space_ptr(true);
-    scratch_lock_array_cuda_space_ptr(true);
-    threadid_lock_array_cuda_space_ptr(true);
+    Impl::finalize_host_cuda_lock_arrays();
 
     if ( m_stream ) {
       for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@@ -653,6 +672,7 @@ void CudaInternal::finalize()
     RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
     RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
     RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
+    RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
 
     m_cudaDev             = -1 ;
     m_multiProcCount      = 0 ;
@@ -666,6 +686,7 @@ void CudaInternal::finalize()
     m_scratchSpace        = 0 ;
     m_scratchFlags        = 0 ;
     m_scratchUnified      = 0 ;
+    m_scratchConcurrentBitset = 0 ;
     m_stream              = 0 ;
   }
 }
@@ -713,9 +734,8 @@ namespace Kokkos {
 Cuda::size_type Cuda::detect_device_count()
 { return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
 
-int Cuda::concurrency() {
-  return 131072;
-}
+int Cuda::concurrency()
+{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
 
 int Cuda::is_initialized()
 { return Impl::CudaInternal::singleton().is_initialized(); }
@@ -798,7 +818,22 @@ void Cuda::fence()
 const char* Cuda::name() { return "Cuda"; }
 
 } // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
+UniqueToken( Kokkos::Cuda const & )
+  : m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
+  , m_count(  Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
+  {}
+
+} // namespace Experimental
+} // namespace Kokkos
+
 #else
+
 void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
+
 #endif // KOKKOS_ENABLE_CUDA
 
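The concurrent bitset allocated in CudaInternal::initialize above is what backs the new Experimental::UniqueToken constructor at the end of this file: a kernel thread can check out an id that is unique among the at most m_maxConcurrency threads in flight, instead of needing one resource slot per work item. A usage sketch, assuming the standard UniqueToken acquire()/release()/size() interface; the view extent and kernel body are hypothetical.

    // Per-thread scratch sized by concurrency rather than by problem size.
    #include <Kokkos_Core.hpp>

    void concurrency_sized_scratch( Kokkos::View< double*, Kokkos::CudaSpace > slots )
    {
      Kokkos::Experimental::UniqueToken<
          Kokkos::Cuda, Kokkos::Experimental::UniqueTokenScope::Global >
        token( Kokkos::Cuda() );

      // Assumption: slots.extent(0) >= token.size().
      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, 1 << 20 ),
        KOKKOS_LAMBDA( const int ) {
          const int id = token.acquire(); // sets a bit in the concurrent bitset
          slots( id ) += 1.0;             // slot is exclusively owned while held
          token.release( id );            // clears the bit for reuse
        } );
    }
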
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
new file mode 100644
index 0000000000..237022ad23
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Kokkos_Cuda.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+namespace Kokkos {
+namespace Impl {
+__device__ __constant__
+CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
+}
+}
+#endif
+
+namespace Kokkos {
+
+namespace {
+
+__global__ void init_lock_array_kernel_atomic() {
+  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+  if(i<CUDA_SPACE_ATOMIC_MASK+1) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
+  }
+}
+
+__global__ void init_lock_array_kernel_threadid(int N) {
+  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+
+  if(i<(unsigned)N) {
+    Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
+  }
+}
+
+} // namespace
+
+namespace Impl {
+
+CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
+
+void initialize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic != nullptr) return;
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
+                            sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
+  CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
+                            sizeof(int)*(Kokkos::Cuda::concurrency())));
+  g_host_cuda_lock_arrays.n = Kokkos::Cuda::concurrency();
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
+  init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
+  CUDA_SAFE_CALL(cudaDeviceSynchronize());
+}
+
+void finalize_host_cuda_lock_arrays() {
+  if (g_host_cuda_lock_arrays.atomic == nullptr) return;
+  cudaFree(g_host_cuda_lock_arrays.atomic);
+  g_host_cuda_lock_arrays.atomic = nullptr;
+  cudaFree(g_host_cuda_lock_arrays.scratch);
+  g_host_cuda_lock_arrays.scratch = nullptr;
+  g_host_cuda_lock_arrays.n = 0;
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+#endif
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#else
+
+void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
+
+#endif
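
Both entry points in this new file are idempotent: initialize_host_cuda_lock_arrays returns early once g_host_cuda_lock_arrays.atomic is non-null, and finalize_host_cuda_lock_arrays returns early once it is null again. A sketch of the resulting call pattern (in practice CudaInternal::initialize/finalize make these calls, as the Kokkos_Cuda_Impl.cpp hunks above show):

    #include <Cuda/Kokkos_Cuda_Locks.hpp>

    void lock_array_lifecycle()
    {
      Kokkos::Impl::initialize_host_cuda_lock_arrays(); // allocates, zeroes, copies to device
      Kokkos::Impl::initialize_host_cuda_lock_arrays(); // no-op: atomic != nullptr
      // ... run kernels that may take the per-address hash locks ...
      Kokkos::Impl::finalize_host_cuda_lock_arrays();   // frees and nulls both arrays
      Kokkos::Impl::finalize_host_cuda_lock_arrays();   // no-op: atomic == nullptr
    }
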
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
new file mode 100644
index 0000000000..d01f06fb4f
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp
@@ -0,0 +1,166 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_LOCKS_HPP
+#define KOKKOS_CUDA_LOCKS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <cstdint>
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+struct CudaLockArrays {
+  std::int32_t* atomic;
+  std::int32_t* scratch;
+  std::int32_t n;
+};
+
+/// \brief This global variable in Host space is the central definition
+///        of these arrays.
+extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        valid, initialized arrays.
+///
+/// This call is idempotent.
+void initialize_host_cuda_lock_arrays();
+
+/// \brief After this call, the g_host_cuda_lock_arrays variable has
+///        all null pointers, and all array memory has been freed.
+///
+/// This call is idempotent.
+void finalize_host_cuda_lock_arrays();
+
+} // namespace Impl
+} // namespace Kokkos
+
+#if defined( __CUDACC__ )
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief This global variable in CUDA space is what kernels use
+///        to get access to the lock arrays.
+///
+/// When relocatable device code is enabled, there can be one single
+/// instance of this global variable for the entire executable,
+/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
+/// here must then be extern).
+/// This one instance will be initialized by initialize_host_cuda_lock_arrays
+/// and need not be modified afterwards.
+///
+/// When relocatable device code is disabled, an instance of this variable
+/// will be created in every translation unit that sees this header file
+/// (we make this clear by marking it static, meaning no other translation
+///  unit can link to it).
+/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
+/// instances in other translation units, we must update this CUDA global
+/// variable based on the Host global variable prior to running any kernels
+/// that will use it.
+/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+__device__ __constant__
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+extern
+#endif
+Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
+
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline
+bool lock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
+}
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address.
+__device__ inline
+void unlock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/* Dan Ibanez: it is critical that this code be a macro, so that it will
+   capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
+   putting this in an inline function will NOT do the right thing! */
+#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
+{ \
+  CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
+        Kokkos::Impl::g_device_cuda_lock_arrays , \
+        & Kokkos::Impl::g_host_cuda_lock_arrays , \
+        sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
+}
+
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+#else
+#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+#endif
+
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* defined( KOKKOS_ENABLE_CUDA ) */
+
+#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
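
lock_address_cuda_space and unlock_address_cuda_space above hash an address into one of CUDA_SPACE_ATOMIC_MASK+1 lock slots, so a failed try-lock means "another thread holds this hash slot, retry", never "error". A sketch of the spin pattern a caller uses; the update functor is a hypothetical stand-in for the lock-based atomic implementations that consume this header.

    // Spin on the per-slot try-lock, apply a plain read-modify-write under the
    // lock, then release. Kokkos' fallback atomics for oversized types follow
    // this shape.
    #include <Cuda/Kokkos_Cuda_Locks.hpp>

    template< class T, class UpdateOp >
    __device__ T locked_fetch_oper( T* ptr, UpdateOp op )
    {
      T old;
      bool done = false;
      while ( !done ) {
        if ( Kokkos::Impl::lock_address_cuda_space( (void*) ptr ) ) {
          old  = *ptr;
          *ptr = op( old );  // safe: this thread holds ptr's hash-slot lock
          Kokkos::Impl::unlock_address_cuda_space( (void*) ptr );
          done = true;
        }
      }
      return old;
    }
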
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 0c8c700e8f..e2eab19e45 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -58,6 +58,7 @@
 #include 
 #include 
 #include 
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include 
 
 #if defined(KOKKOS_ENABLE_PROFILING)
@@ -65,6 +66,8 @@
 #include 
 #endif
 
+#include <KokkosExp_MDRangePolicy.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -318,6 +321,7 @@ private:
   typedef Kokkos::RangePolicy< Traits ... > Policy;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds LaunchBounds ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
@@ -363,7 +367,7 @@ public:
       const dim3 block(  1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
       const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
 
-      CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
     }
 
   ParallelFor( const FunctorType  & arg_functor ,
@@ -373,6 +377,115 @@ public:
     { }
 };
 
+
+// MDRangePolicy impl
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::Experimental::MDRangePolicy< Traits ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ...  > Policy ;
+  using RP = Policy;
+  typedef typename Policy::array_index_type array_index_type;
+  typedef typename Policy::index_type index_type;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+
+  const FunctorType m_functor ;
+  const Policy      m_rp ;
+
+public:
+
+  inline
+  __device__
+  void operator()(void) const
+    {
+      Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
+    }
+
+
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
+    if ( RP::rank == 2 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1
+          );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        ,  std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
+                  , static_cast<index_type>(maxblocks) )
+        );
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+//  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_rp(  arg_policy )
+    {}
+};
+
+
 template< class FunctorType , class ... Properties >
 class ParallelFor< FunctorType
                  , Kokkos::TeamPolicy< Properties ... >
@@ -384,6 +497,7 @@ private:
   typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... >   Policy ;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds  LaunchBounds ;
 
 public:
 
@@ -430,15 +544,15 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __shared__ int base_thread_id;
       if (threadIdx.x==0 && threadIdx.y==0 ) {
-        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
         threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
-        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
         int done = 0;
         while (!done) {
-          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
           if(!done) {
             threadid += blockDim.x * blockDim.y;
-            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+            if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
           }
         }
         base_thread_id = threadid;
@@ -448,7 +562,8 @@ public:
     }
 
 
-    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
 
       this-> template exec_team< WorkTag >(
         typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
@@ -462,7 +577,7 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __syncthreads();
       if (threadIdx.x==0 && threadIdx.y==0 )
-        kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
     }
   }
 
@@ -473,7 +588,7 @@ public:
       const dim3 grid( int(m_league_size) , 1 , 1 );
       const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
 
-      CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
+      CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
 
     }
 
@@ -529,6 +644,7 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
+  typedef typename Policy::launch_bounds LaunchBounds ;
 
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
@@ -563,6 +679,7 @@ private:
   typedef int DummySHMEMReductionType;
 
 public:
+  // Make the exec_range calls dispatch to Reduce::DeviceIterateTile
   template< class TagType >
   __device__ inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
@@ -686,7 +803,7 @@ public:
 
       const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
 
-      CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
+      CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
 
       Cuda::fence();
 
@@ -737,6 +854,232 @@ public:
   { }
 };
 
+
+// MDRangePolicy impl
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::Experimental::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
+  typedef typename Policy::array_index_type                 array_index_type;
+  typedef typename Policy::index_type                       index_type;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::launch_bounds LaunchBounds;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ; // used for workrange and nwork
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+
+  typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag,reference_type> DeviceIteratePattern;
+
+  // Use the shfl-based reduction only for statically sized value types of more than 128 bits
+  enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
+  // Some crutch to do function overloading
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+public:
+  inline
+  __device__
+  void
+  exec_range( reference_type update ) const
+  {
+    Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag,reference_type>(m_policy, m_functor, update).exec_range();
+  }
+
+  inline
+  __device__
+  void operator() (void) const {
+    run(Kokkos::Impl::if_c< UseShflReduction, DummyShflReductionType, DummySHMEMReductionType >::select(1,1.0) );
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType& ) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    {
+      reference_type value =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+      this-> exec_range( value );
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    // Problem: non power-of-two blockDim
+    if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+  __device__ inline
+   void run(const DummyShflReductionType&) const
+   {
+
+     value_type value;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+     // Number of blocks is bounded so that the reduction can be limited to two passes.
+     // Each thread block is given an approximately equal amount of work to perform.
+     // Accumulate the values for this block.
+     // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+     const Member work_part =
+       ( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
+
+     this-> exec_range( value );
+
+     pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+     int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
+     max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
+
+     value_type init;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+     if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
+         (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+       const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+       if(id==0) {
+         Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+         *result = value;
+       }
+     }
+   }
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  inline
+  void execute()
+    {
+      const int nwork = m_policy.m_num_tiles;
+      if ( nwork ) {
+        int block_size = m_policy.m_prod_tile_dims;
+        // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
+        // Nearest power of two
+        int exponent_pow_two = std::ceil( std::log2(block_size) );
+        block_size = std::pow(2, exponent_pow_two);
+        int suggested_blocksize = local_block_size( m_functor );
+
+        block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
+
+
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+        // REQUIRED ( 1 , N , 1 )
+        const dim3 block( 1 , block_size , 1 );
+        // Required grid.x <= block.y
+        const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
+
+      const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
+
+      CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
+
+      Cuda::fence();
+
+      if ( m_result_ptr ) {
+        if ( m_unified_space ) {
+          const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer)  );
+          for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
+        }
+        else {
+          const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer)  );
+          DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
+        }
+      }
+    }
+    else {
+      if (m_result_ptr) {
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+      }
+    }
+  }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if<
+                   Kokkos::is_view< HostViewType >::value
+                ,void*>::type = NULL)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( InvalidType() )
+  , m_result_ptr( arg_result.ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {}
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.view().ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {}
+};
+
+
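For orientation, the net effect of this new specialization: a scalar reduction over a multi-dimensional index range now dispatches natively on Cuda. A minimal user-level sketch (the view, extents, tile sizes, and function name below are illustrative, not part of the patch):

```cpp
#include <Kokkos_Core.hpp>

// Sketch: sum the entries of an n0 x n1 device view. Tiles of {16,16}
// keep the tile product (256) under the 512-thread limit enforced in
// KokkosExp_MDRangePolicy.hpp further down in this patch.
double sum_2d( int n0, int n1,
               Kokkos::View<double**, Kokkos::CudaSpace> a )
{
  double sum = 0.0;
  Kokkos::Experimental::MDRangePolicy<
      Kokkos::Cuda, Kokkos::Experimental::Rank<2> >
    policy( {0, 0}, {n0, n1}, {16, 16} );
  Kokkos::parallel_reduce( policy,
    KOKKOS_LAMBDA( const int i, const int j, double & partial ) {
      partial += a(i, j);  // per-thread partial, combined by the two-pass scheme above
    }, sum );
  return sum;
}
```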
 //----------------------------------------------------------------------------
 
 #if 1
@@ -753,6 +1096,7 @@ private:
   typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... >  Policy ;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::launch_bounds     LaunchBounds ;
 
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
@@ -819,15 +1163,15 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __shared__ int base_thread_id;
       if (threadIdx.x==0 && threadIdx.y==0 ) {
-        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
         threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
-        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
         int done = 0;
         while (!done) {
-          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
           if(!done) {
             threadid += blockDim.x * blockDim.y;
-            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+            if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
           }
         }
         base_thread_id = threadid;
@@ -840,7 +1184,7 @@ public:
     if ( m_scratch_size[1]>0 ) {
       __syncthreads();
       if (threadIdx.x==0 && threadIdx.y==0 )
-        kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
+        Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
     }
   }
 
@@ -854,7 +1198,8 @@ public:
       ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
 
     // Iterate this block through the league
-    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
       this-> template exec_team< WorkTag >
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
@@ -894,7 +1239,8 @@ public:
     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
 
     // Iterate this block through the league
-    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+    const int int_league_size = (int)m_league_size;
+    for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
       this-> template exec_team< WorkTag >
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
@@ -936,7 +1282,7 @@ public:
         const dim3 grid( block_count , 1 , 1 );
         const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
 
-        CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
 
         Cuda::fence();
 
@@ -975,12 +1321,6 @@ public:
   , m_shmem_begin( 0 )
   , m_shmem_size( 0 )
   , m_scratch_ptr{NULL,NULL}
-  , m_league_size( arg_policy.league_size() )
-  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
-      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
-                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
-                                                               arg_policy.vector_length() )
-  , m_vector_size( arg_policy.vector_length() )
   , m_scratch_size{
     arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
         Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
@@ -991,6 +1331,12 @@ public:
                                                                  arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
                                                                  arg_policy.vector_length() )
         )}
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+                                                               arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
   {
     // Return Init value if the number of worksets is zero
     if( arg_policy.league_size() == 0) {
@@ -1150,6 +1496,7 @@ private:
   typedef typename reducer_type<>::pointer_type    pointer_type ;
   typedef typename reducer_type<>::reference_type  reference_type ;
   typedef typename reducer_type<>::value_type      value_type ;
+  typedef typename Policy::launch_bounds           LaunchBounds ;
 
   typedef Kokkos::Impl::FunctorAnalysis
     < Kokkos::Impl::FunctorPatternInterface::REDUCE
@@ -1273,7 +1620,7 @@ public:
         const int  shmem = m_shmem_team_begin + m_shmem_team_size ;
 
         // copy to device and execute
-        CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem );
+        CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem );
 
         Cuda::fence();
 
@@ -1373,7 +1720,7 @@ public:
 
     if ( CudaTraits::WarpSize < team_threads ) {
       // Need inter-warp team reduction (collectives) shared memory
-      // Speculate an upper bound for the value size 
+      // Speculate an upper bound for the value size
 
       m_shmem_team_begin =
         align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
@@ -1426,7 +1773,7 @@ public:
 
     // Reduce space has claim flag followed by value buffer
     const int global_reduce_value_size =
-      max_concurrent_block * 
+      max_concurrent_block *
       ( aligned_flag_size + align_scratch( value_size ) );
 
     // Scratch space has claim flag followed by scratch buffer
@@ -1469,6 +1816,7 @@ private:
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::launch_bounds  LaunchBounds ;
 
   typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
@@ -1655,10 +2003,10 @@ public:
         const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
 
         m_final = false ;
-        CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+        CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
 
         m_final = true ;
-        CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+        CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
       }
     }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 432c7895cc..709cbbd534 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
 __device__
 inline void cuda_intra_warp_reduction( ValueType& result,
                                        const JoinOp& join,
-                                       const int max_active_thread = blockDim.y) {
+                                       const uint32_t max_active_thread = blockDim.y) {
 
   unsigned int shift = 1;
 
@@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
         if( id + 1 < int(gridDim.x) )
           join(value, tmp);
       }
+      int active = __ballot(1);
       if (int(blockDim.x*blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2,32);
         if( id + 2 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4,32);
         if( id + 4 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8,32);
         if( id + 8 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16,32);
         if( id + 16 < int(gridDim.x) )
           join(value, tmp);
       }
+      active += __ballot(1);
     }
   }
-
   //The last block has in its thread=0 the global reduction value through "value"
   return last_block;
 #else
@@ -302,7 +306,7 @@ template< class ReducerType >
 __device__ inline
 typename std::enable_if< Kokkos::is_reducer::value >::type
 cuda_intra_warp_reduction( const ReducerType& reducer,
-                           const int max_active_thread = blockDim.y) {
+                           const uint32_t max_active_thread = blockDim.y) {
 
   typedef typename ReducerType::value_type ValueType;
 
@@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
         if( id + 1 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      int active = __ballot(1);
       if (int(blockDim.x*blockDim.y) > 2) {
         value_type tmp = Kokkos::shfl_down(value, 2,32);
         if( id + 2 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 4) {
         value_type tmp = Kokkos::shfl_down(value, 4,32);
         if( id + 4 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 8) {
         value_type tmp = Kokkos::shfl_down(value, 8,32);
         if( id + 8 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
       if (int(blockDim.x*blockDim.y) > 16) {
         value_type tmp = Kokkos::shfl_down(value, 16,32);
         if( id + 16 < int(gridDim.x) )
           reducer.join(value, tmp);
       }
+      active += __ballot(1);
     }
   }
 
@@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
   typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  //typedef typename ValueTraits::reference_type  reference_type ;
 
   // '__ffs' = position of the least significant bit set to 1.
   // 'blockDim.y' is guaranteed to be a power of two so this
@@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
 
     {
       void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
-      reference_type shared_value = ValueInit::init( functor , shared_ptr );
+      /* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
 
       for ( size_type i = b ; i < e ; ++i ) {
         ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
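The hunks above interleave `__ballot(1)` calls between the guarded shuffle steps; the ballot result is accumulated but otherwise unused, which reads as a workaround to keep the warp converged between steps. For orientation, a stripped-down sketch of the shuffle-down ladder these functions implement (illustrative only; assumes a 32-lane warp and a float payload):

```cpp
// Each step pulls the partial value from the lane `delta` positions higher
// and folds it in; after log2(32) steps lane 0 holds the warp-wide sum.
__device__ float warp_sum( float value )
{
  for ( int delta = 1 ; delta < 32 ; delta <<= 1 ) {
    value += __shfl_down( value , delta , 32 );
  }
  return value ; // meaningful on lane 0
}
```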
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
index 3c6f0a5dda..5f08800c40 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
 
 //----------------------------------------------------------------------------
 
+#if defined( KOKKOS_DEBUG )
+
+__device__
+void verify_warp_convergence( const char * const where )
+{
+  const unsigned b = __ballot(1);
+
+  if ( b != ~0u ) {
+
+printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
+      , where
+      , blockIdx.x
+      , blockIdx.y
+      , blockIdx.z
+      , threadIdx.x
+      , threadIdx.y
+      , threadIdx.z
+      , b );
+
+  }
+}
+
+#endif // #if defined( KOKKOS_DEBUG )
+
+//----------------------------------------------------------------------------
+
 __device__
 void TaskQueueSpecialization< Kokkos::Cuda >::driver
-  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
+  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue 
+  , int32_t shmem_per_warp )
 {
   using Member = TaskExec< Kokkos::Cuda > ;
   using Queue  = TaskQueue< Kokkos::Cuda > ;
-  using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
+  using task_root_type = TaskBase< void , void , void > ;
+
+  extern __shared__ int32_t shmem_all[];
 
   task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-  Member single_exec( 1 );
-  Member team_exec( blockDim.y );
+  int32_t * const warp_shmem =
+    shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
+
+  task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
 
   const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
 
-  union {
-    task_root_type * ptr ;
-    int              raw[2] ;
-  } task ;
+  Member single_exec( warp_shmem , 1 );
+  Member team_exec( warp_shmem , blockDim.y );
+
+  task_root_type * task_ptr ;
 
   // Loop until all queues are empty and no tasks in flight
 
@@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
 
     if ( 0 == warp_lane ) {
 
-      task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+      task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
 
       // Loop by priority and then type
-      for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
-        for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
-          task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
+      for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
+          task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
         }
       }
 
 #if 0
 printf("TaskQueue::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
-      , uintptr_t(task.ptr));
+      , uintptr_t(task_ptr));
 #endif
 
     }
 
     // shuffle broadcast
 
-    task.raw[0] = __shfl( task.raw[0] , 0 );
-    task.raw[1] = __shfl( task.raw[1] , 0 );
+    ((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
+    ((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
 
-    if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
+#if defined( KOKKOS_DEBUG )
+    verify_warp_convergence("task_ptr");
+#endif
 
-    if ( end != task.ptr ) {
-      if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
+    if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
+
+    if ( end != task_ptr ) {
+
+      // The whole warp copies the task's closure to/from shared memory.
+      // Use all threads of warp for coalesced read/write.
+
+      int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
+      int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
+
+      int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
+
+      // copy global to shared memory:
+
+      for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
+        warp_shmem[i] = task_mem[i] ;
+      }
+
+      Kokkos::memory_fence();
+
+      // Copy done - use memory fence so that memory writes are visible.
+      // For reliable warp convergence on Pascal and Volta an explicit
+      // warp level synchronization will also be required.
+
+      if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
         // Thread Team Task
-        (*task.ptr->m_apply)( task.ptr , & team_exec );
+        (*task_shmem->m_apply)( task_shmem , & team_exec );
       }
       else if ( 0 == threadIdx.y ) {
         // Single Thread Task
-        (*task.ptr->m_apply)( task.ptr , & single_exec );
+        (*task_shmem->m_apply)( task_shmem , & single_exec );
       }
 
+      // copy shared to global memory:
+
+      for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
+        task_mem[i] = warp_shmem[i] ;
+      }
+
+      Kokkos::memory_fence();
+
+#if defined( KOKKOS_DEBUG )
+    verify_warp_convergence("apply");
+#endif
+
+      // If a respawn was requested, copy the respawn data back to main memory
+
       if ( 0 == warp_lane ) {
-        queue->complete( task.ptr );
+
+        if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
+          ( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
+          ( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
+        }
+
+        queue->complete( task_ptr );
       }
     }
   } while(1);
@@ -130,18 +206,20 @@ printf("TaskQueue::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
 namespace {
 
 __global__
-void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
-{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
+void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue 
+                            , int32_t shmem_size )
+{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
 
 }
 
 void TaskQueueSpecialization< Kokkos::Cuda >::execute
   ( TaskQueue< Kokkos::Cuda > * const queue )
 {
+  const int shared_per_warp = 2048 ;
   const int warps_per_block = 4 ;
   const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
   const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
-  const int shared = 0 ;
+  const int shared_total = shared_per_warp * warps_per_block ;
   const cudaStream_t stream = 0 ;
 
   CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
   //
   // CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
 
-  cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
+  cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
 
   CUDA_SAFE_CALL( cudaGetLastError() );
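The driver now stages each task closure in a per-warp shared-memory buffer, so the launch reserves a fixed budget; the arithmetic implied by `execute()` above is simply:

```cpp
// Sketch of the shared-memory budget set up in execute():
const int shared_per_warp = 2048;                              // bytes of closure buffer per warp
const int warps_per_block = 4;                                 // blockDim.z in the launch
const int shared_total    = shared_per_warp * warps_per_block; // = 8192 bytes of dynamic shmem per block
```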
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 5d08219ea5..4a52985d29 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -57,7 +57,7 @@ namespace {
 template< typename TaskType >
 __global__
 void set_cuda_task_base_apply_function_pointer
-  ( TaskBase< Kokkos::Cuda , void , void >::function_type * ptr )
+  ( TaskBase< void , void , void >::function_type * ptr )
 { *ptr = TaskType::apply ; }
 
 }
@@ -78,7 +78,7 @@ public:
   void iff_single_thread_recursive_execute( queue_type * const ) {}
 
   __device__
-  static void driver( queue_type * const );
+  static void driver( queue_type * const , int32_t );
 
   static
   void execute( queue_type * const );
@@ -106,7 +106,14 @@ public:
 
 extern template class TaskQueue< Kokkos::Cuda > ;
 
+}} /* namespace Kokkos::Impl */
+
 //----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
 /**\brief  Impl::TaskExec is the TaskScheduler::member_type
  *         passed to tasks running in a Cuda space.
  *
@@ -134,11 +141,13 @@ private:
   friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
   friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
 
+  int32_t * m_team_shmem ;
   const int m_team_size ;
 
   __device__
-  TaskExec( int arg_team_size = blockDim.y )
-    : m_team_size( arg_team_size ) {}
+  TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
+    : m_team_shmem( arg_team_shmem )
+    , m_team_size( arg_team_size ) {}
 
 public:
 
@@ -154,7 +163,13 @@ public:
 
 };
 
+}} /* namespace Kokkos::Impl */
+
 //----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
 
 template< typename iType >
 struct TeamThreadRangeBoundariesStruct< iType , TaskExec< Kokkos::Cuda > >
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
index 084daa098b..3f3d85ecd1 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
@@ -106,7 +106,7 @@ private:
   typedef Kokkos::Cuda                           execution_space ;
   typedef execution_space::scratch_memory_space  scratch_memory_space ;
 
-  void                * m_team_reduce ;
+  mutable void        * m_team_reduce ;
   scratch_memory_space  m_team_shared ;
   int                   m_team_reduce_size ;
   int                   m_league_rank ;
@@ -166,7 +166,7 @@ public:
       if ( 1 == blockDim.z ) { // team == block
         __syncthreads();
         // Wait for shared data write until all threads arrive here
-        if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
+        if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
           *((ValueType*) m_team_reduce) = val ;
         }
         __syncthreads(); // Wait for shared data read until root thread writes
@@ -210,7 +210,7 @@ public:
       const int wx =
         ( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
 
-      for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
+      for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
 
         cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
 
@@ -354,7 +354,7 @@ public:
 
       for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
         cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
-        if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
+        if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
       }
 
       // Broadcast from root lane to all other lanes.
@@ -410,7 +410,7 @@ public:
 
         value_type tmp( reducer.reference() );
 
-        for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
+        for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
 
           cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
 
@@ -479,7 +479,7 @@ public:
 
           __threadfence(); // Wait until global write is visible.
 
-          last_block = gridDim.x ==
+          last_block = (int)gridDim.x ==
                        1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
 
           // If last block then reset count
@@ -509,7 +509,7 @@ public:
         reducer.copy( ((pointer_type)shmem) + offset
                     , ((pointer_type)global_scratch_space) + offset );
 
-        for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
+        for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
           reducer.join( ((pointer_type)shmem) + offset
                       , ((pointer_type)global_scratch_space)
                         + i * reducer.length() );
@@ -576,6 +576,14 @@ public:
     , m_league_size( arg_league_size )
     {}
 
+public:
+  // Declared to avoid unused-private-member warnings, which are triggered
+  // when SFINAE excludes the member function that uses these variables.
+  // Making another class a friend would also suppress these warnings.
+  bool impl_avoid_sfinae_warning() const noexcept
+  {
+    return m_team_reduce_size > 0 && m_team_reduce != nullptr;
+  }
 };
 
 } // namespace Impl
@@ -913,10 +921,10 @@ void parallel_scan
     //  [t] += [t-4] if t >= 4
     //  ...
 
-    for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
+    for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
       value_type tmp = 0 ;
       Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
-      if ( j <= threadIdx.x ) { sval += tmp ; }
+      if ( j <= (int)threadIdx.x ) { sval += tmp ; }
     }
 
     // Include accumulation and remove value for exclusive scan:
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
new file mode 100644
index 0000000000..e11ae4798f
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp
@@ -0,0 +1,133 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_CUDA
+
+#include <Kokkos_CudaSpace.hpp>
+#include <Kokkos_UniqueToken.hpp>
+#include <impl/Kokkos_SharedAlloc.hpp>
+#include <impl/Kokkos_ConcurrentBitset.hpp>
+
+namespace Kokkos { namespace Experimental {
+
+// both global and instance Unique Tokens are implemented in the same way
+template<>
+class UniqueToken< Cuda, UniqueTokenScope::Global >
+{
+private:
+
+  uint32_t volatile * m_buffer ;
+  uint32_t            m_count ;
+
+public:
+
+  using execution_space = Cuda;
+
+  explicit
+  UniqueToken( execution_space const& );
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken() : m_buffer(0), m_count(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken( const UniqueToken & ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken( UniqueToken && )      = default;
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken & operator=( const UniqueToken & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  UniqueToken & operator=( UniqueToken && ) = default ;
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int32_t size() const noexcept { return m_count ; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  KOKKOS_INLINE_FUNCTION
+  int32_t acquire() const
+  {
+    const Kokkos::pair<int,int> result =
+      Kokkos::Impl::concurrent_bitset::
+        acquire_bounded( m_buffer
+                       , m_count
+                       , Kokkos::Impl::clock_tic() % m_count
+                       );
+
+    if ( result.first < 0 ) {
+      Kokkos::abort("UniqueToken<Cuda> failure to acquire tokens, no tokens available" );
+    }
+
+    return result.first;
+  }
+
+  /// \brief release an acquired value
+  KOKKOS_INLINE_FUNCTION
+  void release( int32_t i ) const noexcept
+  {
+    Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
+  }
+};
+
+template<>
+class UniqueToken< Cuda, UniqueTokenScope::Instance >
+  : public UniqueToken< Cuda, UniqueTokenScope::Global >
+{
+public:
+
+  explicit
+  UniqueToken( execution_space const& arg )
+    : UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
+};
+
+}} // namespace Kokkos::Experimental
+
+#endif // KOKKOS_ENABLE_CUDA
+#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
+
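A minimal usage sketch for the new token pool (the view, extents, and functor body are illustrative; `acquire()`/`release()` are the interface defined above):

```cpp
#include <Kokkos_Core.hpp>

// Sketch: give each concurrently executing iteration exclusive use of one
// scratch slot. 'scratch' must have extent >= token.size().
void use_tokens( int n, Kokkos::View<int*, Kokkos::CudaSpace> scratch )
{
  Kokkos::Experimental::UniqueToken< Kokkos::Cuda > token( Kokkos::Cuda() );
  Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, n ),
    KOKKOS_LAMBDA( const int i ) {
      const int32_t id = token.acquire();  // 0 <= id < token.size()
      scratch( id ) += i;                  // slot 'id' is exclusively ours
      token.release( id );                 // hand the slot back
    } );
}
```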
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
index f5e2d87fb6..d641622bb6 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -221,7 +221,6 @@ struct CudaLDGFetch {
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
@@ -294,9 +293,8 @@ public:
     }
 };
 
-}
-}
-}
+} // namespace Impl
+} // namespace Kokkos
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
new file mode 100644
index 0000000000..99778c64b1
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
+#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::Cuda
+                 >
+  : public Kokkos::Impl::Experimental::
+           WorkGraphExec< FunctorType,
+                          Kokkos::Cuda,
+                          Traits ...
+                        >
+{
+public:
+
+  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... >   Policy ;
+  typedef Kokkos::Impl::Experimental::
+          WorkGraphExec< FunctorType, Kokkos::Cuda, Traits ... > Base ;
+  typedef ParallelFor< FunctorType,
+                       Kokkos::Experimental::WorkGraphPolicy< Traits ... >,
+                       Kokkos::Cuda > Self ;
+
+private:
+
+  template< class TagType >
+  __device__
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    Base::m_functor( i );
+  }
+
+  template< class TagType >
+  __device__
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    const TagType t{} ;
+    Base::m_functor( t , i );
+  }
+
+public:
+
+  __device__
+  inline
+  void operator()() const {
+    for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
+      exec_one< typename Policy::work_tag >( i );
+      Base::after_work(i);
+    }
+  }
+
+  inline
+  void execute()
+  {
+    const int warps_per_block = 4 ;
+    const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+    const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
+    const int shared = 0 ;
+    const cudaStream_t stream = 0 ;
+
+    Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : Base( arg_functor, arg_policy )
+  {
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index 4f68d9c2c0..6ef7443a14 100644
--- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -52,6 +52,7 @@
 
 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
+#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
 #endif
 
 namespace Kokkos { namespace Experimental {
@@ -120,28 +121,17 @@ struct MDRangePolicy
                                        , typename traits::index_type
                                        > ;
 
+  typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
+
   static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
                , "Kokkos Error: MD iteration pattern not defined" );
 
   using iteration_pattern   = typename traits::iteration_pattern;
   using work_tag            = typename traits::work_tag;
+  using launch_bounds       = typename traits::launch_bounds;
+  using member_type = typename range_policy::member_type;
 
-  static constexpr int rank = iteration_pattern::rank;
-
-  static constexpr int outer_direction = static_cast<int> (
-      (iteration_pattern::outer_direction != Iterate::Default)
-    ? iteration_pattern::outer_direction
-    : default_outer_direction< typename traits::execution_space>::value );
-
-  static constexpr int inner_direction = static_cast<int> (
-      iteration_pattern::inner_direction != Iterate::Default
-    ? iteration_pattern::inner_direction
-    : default_inner_direction< typename traits::execution_space>::value ) ;
-
-
-  // Ugly ugly workaround intel 14 not handling scoped enum correctly
-  static constexpr int Right = static_cast<int>( Iterate::Right );
-  static constexpr int Left  = static_cast<int>( Iterate::Left );
+  enum { rank = static_cast<int>(iteration_pattern::rank) };
 
   using index_type  = typename traits::index_type;
   using array_index_type = long;
@@ -155,11 +145,50 @@ struct MDRangePolicy
   // This would require the user to either pass a matching index_type parameter
   // as template parameter to the MDRangePolicy or static_cast the individual values
 
+  point_type m_lower;
+  point_type m_upper;
+  tile_type  m_tile;
+  point_type m_tile_end;
+  index_type m_num_tiles;
+  index_type m_prod_tile_dims;
+
+/*
+  // NDE enum impl definition alternative - replace static constexpr int ? 
+  enum { outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename traits::execution_space>::value ) };
+
+  enum { inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename traits::execution_space>::value ) };
+
+  enum { Right = static_cast<int>( Iterate::Right ) };
+  enum { Left  = static_cast<int>( Iterate::Left ) };
+*/
+  //static constexpr int rank = iteration_pattern::rank;
+
+  static constexpr int outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename traits::execution_space>::value );
+
+  static constexpr int inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename traits::execution_space>::value ) ;
+
+  // Ugly workaround for Intel 14 not handling scoped enums correctly
+  static constexpr int Right = static_cast<int>( Iterate::Right );
+  static constexpr int Left  = static_cast<int>( Iterate::Left );
+
   MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
     : m_lower(lower)
     , m_upper(upper)
     , m_tile(tile)
     , m_num_tiles(1)
+    , m_prod_tile_dims(1)
   {
     // Host
     if ( true
@@ -172,8 +201,8 @@ struct MDRangePolicy
       for (int i=0; i<rank; ++i) {
         span = m_upper[i] - m_lower[i];
         if ( m_tile[i] <= 0 ) {
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
             m_tile[i] = 2;
           }
@@ -183,6 +212,7 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
     }
     #if defined(KOKKOS_ENABLE_CUDA)
@@ -190,14 +220,18 @@ struct MDRangePolicy
     {
       index_type span;
       for (int i=0; i<rank; ++i) {
         span = m_upper[i] - m_lower[i];
         if ( m_tile[i] <= 0 ) {
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
-            m_tile[i] = 2;
+            if ( m_prod_tile_dims < 512 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
           }
           else {
             m_tile[i] = 16;
@@ -205,12 +239,9 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
-      index_type total_tile_size_check = 1;
-      for (int i=0; i<rank; ++i) {
-        total_tile_size_check *= m_tile[i];
-      }
-      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+      if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
         printf(" Tile dimensions exceed Cuda limits\n");
         Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
         //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
@@ -223,19 +254,7 @@ struct MDRangePolicy
   template < typename LT , typename UT , typename TT = array_index_type >
   MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
   {
-#if 0
-    // This should work, less duplicated code but not yet extensively tested
-    point_type lower_tmp, upper_tmp;
-    tile_type tile_tmp;
-    for ( auto i = 0; i < rank; ++i ) {
-      lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
-      upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
-      tile_tmp[i]  = static_cast<array_index_type>(tile.begin()[i]);
-    }
 
-    MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
-
-#else
     if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
       Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
 
@@ -249,7 +268,7 @@ struct MDRangePolicy
     }
 
     m_num_tiles = 1;
-
+    m_prod_tile_dims = 1;
 
     // Host
     if ( true
@@ -262,8 +281,8 @@ struct MDRangePolicy
       for (int i=0; i<rank; ++i) {
         span = m_upper[i] - m_lower[i];
         if ( m_tile[i] <= 0 ) {
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
             m_tile[i] = 2;
           }
@@ -273,6 +292,7 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
     }
     #if defined(KOKKOS_ENABLE_CUDA)
@@ -284,10 +304,14 @@ struct MDRangePolicy
         if ( m_tile[i] <= 0 ) {
           // TODO: determine what is a good default tile size for cuda
           // may be rank dependent
-          if (  (inner_direction == Right && (i < rank-1))
-              || (inner_direction == Left && (i > 0)) )
+          if (  ((int)inner_direction == (int)Right && (i < rank-1))
+              || ((int)inner_direction == (int)Left && (i > 0)) )
           {
-            m_tile[i] = 2;
+            if ( m_prod_tile_dims < 512 ) {
+              m_tile[i] = 2;
+            } else {
+              m_tile[i] = 1;
+            }
           }
           else {
             m_tile[i] = 16;
@@ -295,32 +319,22 @@ struct MDRangePolicy
         }
         m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
         m_num_tiles *= m_tile_end[i];
+        m_prod_tile_dims *= m_tile[i];
       }
-      index_type total_tile_size_check = 1;
-      for (int i=0; i<rank; ++i) {
-        total_tile_size_check *= m_tile[i];
-      }
-      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+      if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
         printf(" Tile dimensions exceed Cuda limits\n");
         Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
         //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
       }
     }
     #endif
-#endif
   }
 
-
-  point_type m_lower;
-  point_type m_upper;
-  tile_type  m_tile;
-  point_type m_tile_end;
-  index_type m_num_tiles;
 };
 // ------------------------------------------------------------------ //
 
 // ------------------------------------------------------------------ //
-//md_parallel_for
+//md_parallel_for - deprecated, use parallel_for
 // ------------------------------------------------------------------ //
 template <typename MDRange, typename Functor, typename Enable = void>
 void md_parallel_for( MDRange const& range
@@ -335,7 +349,6 @@ void md_parallel_for( MDRange const& range
 {
   Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
 
   Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@@ -354,7 +367,6 @@ void md_parallel_for( const std::string& str
 {
   Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
 
   Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@@ -395,7 +407,7 @@ void md_parallel_for( MDRange const& range
 // ------------------------------------------------------------------ //
 
 // ------------------------------------------------------------------ //
-//md_parallel_reduce
+//md_parallel_reduce - deprecated, use parallel_reduce
 // ------------------------------------------------------------------ //
 template <typename MDRange, typename Functor, typename ValueType>
 void md_parallel_reduce( MDRange const& range
@@ -409,9 +421,8 @@ void md_parallel_reduce( MDRange const& range
                       ) >::type* = 0
                     )
 {
-  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
   Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
@@ -428,48 +439,14 @@ void md_parallel_reduce( const std::string& str
                       ) >::type* = 0
                     )
 {
-  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
 
-  //using range_policy = typename MDRange::range_policy;
   using range_policy = typename MDRange::impl_range_policy;
 
   Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
 
-// Cuda - parallel_reduce not implemented yet
-/*
-template <typename MDRange, typename Functor, typename ValueType>
-void md_parallel_reduce( MDRange const& range
-                    , Functor const& f
-                    , ValueType & v
-                    , const std::string& str = ""
-                    , typename std::enable_if<( true
-                      #if defined( KOKKOS_ENABLE_CUDA)
-                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
-                      #endif
-                      ) >::type* = 0
-                    )
-{
-  Impl::DeviceIterateTile closure(range, f, v);
-  closure.execute();
-}
-
-template <typename MDRange, typename Functor, typename ValueType>
-void md_parallel_reduce( const std::string& str
-                    , MDRange const& range
-                    , Functor const& f
-                    , ValueType & v
-                    , typename std::enable_if<( true
-                      #if defined( KOKKOS_ENABLE_CUDA)
-                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
-                      #endif
-                      ) >::type* = 0
-                    )
-{
-  Impl::DeviceIterateTile closure(range, f, v);
-  closure.execute();
-}
-*/
+// Cuda - md_parallel_reduce not implemented - use parallel_reduce
 
 }} // namespace Kokkos::Experimental
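Given the deprecation notes above, the supported spelling is plain `parallel_for`/`parallel_reduce` over an `MDRangePolicy`. A migration sketch (the functor, extents, and function name are illustrative):

```cpp
#include <Kokkos_Core.hpp>

// Sketch: 'f' is any functor with operator()(int,int) const.
template< class Functor >
void run_2d( int N0, int N1, const Functor & f )
{
  Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >
    policy( {0, 0}, {N0, N1}, {16, 16} );  // tile product 256 stays under the 512 Cuda limit

  // Deprecated: Kokkos::Experimental::md_parallel_for( policy, f );
  Kokkos::parallel_for( policy, f );       // preferred spelling
}
```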
 
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
index 3ecae24da4..3c8673c66a 100644
--- a/lib/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -114,40 +114,9 @@
 #endif /* Not pre-selected atomic implementation */
 #endif
 
-//----------------------------------------------------------------------------
-
-// Forward decalaration of functions supporting arbitrary sized atomics
-// This is necessary since Kokkos_Atomic.hpp is internally included very early
-// through Kokkos_HostSpace.hpp as well as the allocation tracker.
 #ifdef KOKKOS_ENABLE_CUDA
-namespace Kokkos {
-namespace Impl {
-/// \brief Aquire a lock for the address
-///
-/// This function tries to aquire the lock for the hash value derived
-/// from the provided ptr. If the lock is successfully aquired the
-/// function returns true. Otherwise it returns false.
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-extern
+#include <Cuda/Kokkos_Cuda_Locks.hpp>
 #endif
-__device__ inline
-bool lock_address_cuda_space(void* ptr);
-
-/// \brief Release lock for the address
-///
-/// This function releases the lock for the hash value derived
-/// from the provided ptr. This function should only be called
-/// after previously successfully aquiring a lock with
-/// lock_address.
-#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
-extern
-#endif
-__device__ inline
-void unlock_address_cuda_space(void* ptr);
-}
-}
-#endif
-
 
 namespace Kokkos {
 template 
diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
index 9a2b53e157..5480dbf40c 100644
--- a/lib/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -79,6 +79,21 @@ struct IndexType
   using type = T;
 };
 
+/**\brief Specify Launch Bounds for CUDA execution.
+ *
+ *  The "best" defaults may be architecture specific.
+ */
+template< unsigned int maxT = 1024 /* Max threads per block */
+        , unsigned int minB = 1    /* Min blocks per SM */
+        >
+struct LaunchBounds
+{
+  using launch_bounds = LaunchBounds;
+  using type = LaunchBounds;
+  static unsigned int constexpr maxTperB {maxT};
+  static unsigned int constexpr minBperSM {minB};
+};
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -119,6 +134,7 @@ using Kokkos::is_array_layout ;
 KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
 KOKKOS_IMPL_IS_CONCEPT( schedule_type )
 KOKKOS_IMPL_IS_CONCEPT( index_type )
+KOKKOS_IMPL_IS_CONCEPT( launch_bounds )
 
 }
 
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index 19de791c0f..ddb11d2894 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -96,11 +96,13 @@ struct InitArguments {
   int num_numa;
   int device_id;
 
-  InitArguments() {
-    num_threads = -1;
-    num_numa = -1;
-    device_id = -1;
-  }
+  InitArguments( int nt = -1
+               , int nn = -1
+               , int dv = -1)
+    : num_threads( nt )
+    , num_numa( nn )
+    , device_id( dv )
+  {}
 };
 
 void initialize(int& narg, char* arg[]);
@@ -168,6 +170,9 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 
 } // namespace Kokkos
 
+#include 
+#include 
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index 09081d2387..8c080f7a8f 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -51,6 +51,9 @@
 #include 
 #include 
 
+#include 
+#include 
+
 //----------------------------------------------------------------------------
 // Have assumed a 64bit build (8byte pointers) throughout the code base.
 
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
new file mode 100644
index 0000000000..93b3fa5ca9
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -0,0 +1,333 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CRS_HPP
+#define KOKKOS_CRS_HPP
+
+namespace Kokkos {
+namespace Experimental {
+
+/// \class Crs
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a Crs is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have a
+///   sparse matrices with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than INT_MAX
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt>
+/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt>
+/// </ul>
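+///
+/// For example (hypothetical values, added for illustration): with
+/// row_map = { 0, 2, 5 } and entries = { 7, 8, 3, 4, 5 }, row 0 spans
+/// entries(0..1) = { 7, 8 } and row 1 spans entries(2..4) = { 3, 4, 5 }.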
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void>::size_type>
+class Crs {
+protected:
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType data_type;
+  typedef typename traits::array_layout array_layout;
+  typedef typename traits::execution_space execution_space;
+  typedef typename traits::memory_space memory_space;
+  typedef typename traits::device_type device_type;
+  typedef SizeType size_type;
+
+  typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
+  typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
+  typedef View<size_type* , array_layout, device_type> row_map_type;
+  typedef View<DataType* , array_layout, device_type> entries_type;
+
+  entries_type entries;
+  row_map_type row_map;
+
+  //! Construct an empty view.
+  Crs () : entries(), row_map() {}
+
+  //! Copy constructor (shallow copy).
+  Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
+  {}
+
+  template<class EntriesType, class RowMapType>
+  Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
+  {}
+
+  /** \brief  Assign to a view of the rhs array.
+   *          If the old view is the last view
+   *          then allocated memory is deallocated.
+   */
+  Crs& operator= (const Crs& rhs) {
+    entries = rhs.entries;
+    row_map = rhs.row_map;
+    return *this;
+  }
+
+  /** \brief  Destroy this view of the array.
+   *          If the last view then allocated memory is deallocated.
+   */
+  ~Crs() {}
+
+  /** \brief  Return number of rows in the graph
+   */
+  KOKKOS_INLINE_FUNCTION
+  size_type numRows() const {
+    return (row_map.dimension_0 () != 0) ?
+      row_map.dimension_0 () - static_cast<size_type> (1) :
+      static_cast<size_type> (0);
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class OutCounts,
+          class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void get_crs_transpose_counts(
+    OutCounts& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
+    std::string const& name = "transpose_counts");
+
+template< class OutCounts,
+          class InCrs>
+void get_crs_row_map_from_counts(
+    OutCounts& out,
+    InCrs const& in,
+    std::string const& name = "row_map");
+
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void transpose_crs(
+    Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
+
+}} // namespace Kokkos::Experimental
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace Experimental {
+
+template <class InCrs, class OutCounts>
+class GetCrsTransposeCounts {
+ public:
+  using execution_space = typename InCrs::execution_space;
+  using self_type = GetCrsTransposeCounts<InCrs, OutCounts>;
+  using index_type = typename InCrs::size_type;
+ private:
+  InCrs in;
+  OutCounts out;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i) const {
+    atomic_increment( &out[in.entries(i)] );
+  }
+  GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out):
+    in(arg_in),out(arg_out) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+    const closure_type closure(*this, policy_type(0, index_type(in.entries.size())));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+template <class InCounts, class OutRowMap>
+class CrsRowMapFromCounts {
+ public:
+  using execution_space = typename InCounts::execution_space;
+  using value_type = typename OutRowMap::value_type;
+  using index_type = typename InCounts::size_type;
+ private:
+  InCounts in;
+  OutRowMap out;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i, value_type& update, bool final_pass) const {
+    update += in(i);
+    if (final_pass) {
+      out(i + 1) = update;
+      if (i == 0) {
+        out(0) = 0;
+      }
+    }
+  }
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& update) const { update = 0; }
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& update, const volatile value_type& input) const {
+    update += input;
+  }
+  using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
+  CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
+    in(arg_in),out(arg_out) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
+    closure_type closure(*this, policy_type(0, in.size()));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+template <class InCrs, class OutCrs>
+class FillCrsTransposeEntries {
+ public:
+  using execution_space = typename InCrs::execution_space;
+  using memory_space = typename InCrs::memory_space;
+  using value_type = typename OutCrs::entries_type::value_type;
+  using index_type = typename InCrs::size_type;
+ private:
+  using counters_type = View<index_type*, memory_space>;
+  InCrs in;
+  OutCrs out;
+  counters_type counters;
+ public:
+  KOKKOS_INLINE_FUNCTION
+  void operator()(index_type i) const {
+    auto begin = in.row_map(i);
+    auto end = in.row_map(i + 1);
+    for (auto j = begin; j < end; ++j) {
+      auto ti = in.entries(j);
+      auto tbegin = out.row_map(ti);
+      auto tj = atomic_fetch_add( &counters(ti), 1 );
+      out.entries( tbegin + tj ) = i;
+    }
+  }
+  using self_type = FillCrsTransposeEntries<InCrs, OutCrs>;
+  FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out):
+    in(arg_in),out(arg_out),
+    counters("counters", arg_out.numRows()) {
+    using policy_type = RangePolicy<index_type, execution_space>;
+    using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
+    const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
+    closure.execute();
+    execution_space::fence();
+  }
+};
+
+}}} // namespace Kokkos::Impl::Experimental
+
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class OutCounts,
+          class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void get_crs_transpose_counts(
+    OutCounts& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
+    std::string const& name) {
+  using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
+  out = OutCounts(name, in.numRows());
+  Kokkos::Impl::Experimental::
+    GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
+}
+
+template< class OutRowMap,
+          class InCounts>
+void get_crs_row_map_from_counts(
+    OutRowMap& out,
+    InCounts const& in,
+    std::string const& name) {
+  out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
+  Kokkos::Impl::Experimental::
+    CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
+}
+
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type,
+          class SizeType>
+void transpose_crs(
+    Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
+    Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in)
+{
+  typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ;
+  typedef typename crs_type::memory_space memory_space ;
+  typedef View<SizeType*, memory_space> counts_type ;
+  {
+    counts_type counts;
+    Kokkos::Experimental::get_crs_transpose_counts(counts, in);
+    Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
+        "tranpose_row_map");
+  }
+  out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
+  Kokkos::Impl::Experimental::
+    FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
+}
+
+}} // namespace Kokkos::Experimental
+
+#endif /* #define KOKKOS_CRS_HPP */
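
A minimal usage sketch for the new Crs container and transpose_crs(), assuming a host execution space inside an initialized Kokkos scope; the graph values are illustrative, not from the patch:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_Crs.hpp>

    void transpose_example() {
      // 2-row graph: row 0 -> {0, 1}, row 1 -> {0}
      using graph_type =
        Kokkos::Experimental::Crs<int, Kokkos::DefaultHostExecutionSpace, void, int>;
      graph_type::row_map_type row_map("row_map", 3);
      graph_type::entries_type entries("entries", 3);
      row_map(0) = 0; row_map(1) = 2; row_map(2) = 3;
      entries(0) = 0; entries(1) = 1; entries(2) = 0;
      graph_type g(entries, row_map);
      graph_type t;
      Kokkos::Experimental::transpose_crs(t, g);  // t holds the transposed graph
    }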
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index f0f0f87458..197831dee5 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -217,8 +217,8 @@ public:
 private:
 
-  cudaStream_t m_stream ;
   int          m_device ;
+  cudaStream_t m_stream ;
 };
 
 } // namespace Kokkos
@@ -295,6 +295,7 @@ struct VerifyExecutionCanAccessMemorySpace
 #include 
 #include 
 #include 
+#include 
 #include 
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
index 307ab193b1..fb5985e164 100644
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -90,7 +90,7 @@ public:
            , const size_t arg_alloc_size ) const ;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
   /*--------------------------------*/
   /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
@@ -186,7 +186,7 @@ public:
            , const size_t arg_alloc_size ) const ;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
   /*--------------------------------*/
 
@@ -234,7 +234,7 @@ public:
            , const size_t arg_alloc_size ) const ;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
 private:
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 375a2d3744..a8c4d77c62 100644
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -384,6 +384,7 @@ Impl::PerThreadValue PerThread(const int& arg);
  *  WorkTag (none): Tag which is used as the first argument for the functor operator.
  *  Schedule (Schedule): Scheduling Policy (Dynamic, or Static).
  *  IndexType (IndexType: Integer Index type used to iterate over the Index space.
+ *  LaunchBounds (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
  */
 template< class ... Properties>
 class TeamPolicy: public
@@ -561,6 +562,45 @@ KOKKOS_INLINE_FUNCTION
 Impl::ThreadVectorRangeBoundariesStruct
 ThreadVectorRange( const TeamMemberType&, const iType& count );
 
+#if defined(KOKKOS_ENABLE_PROFILING)
+namespace Impl {
+
+template<typename FunctorType, typename TagType,
+         bool HasTag = !std::is_same<TagType, void>::value >
+struct ParallelConstructName;
+
+template<typename FunctorType, typename TagType>
+struct ParallelConstructName<FunctorType, TagType, true> {
+  ParallelConstructName(std::string const& label):label_ref(label) {
+    if (label.empty()) {
+      default_name = std::string(typeid(FunctorType).name()) + "/" +
+        typeid(TagType).name();
+    }
+  }
+  std::string const& get() {
+    return (label_ref.empty()) ? default_name : label_ref;
+  }
+  std::string const& label_ref;
+  std::string default_name;
+};
+
+template<typename FunctorType, typename TagType>
+struct ParallelConstructName<FunctorType, TagType, false> {
+  ParallelConstructName(std::string const& label):label_ref(label) {
+    if (label.empty()) {
+      default_name = std::string(typeid(FunctorType).name());
+    }
+  }
+  std::string const& get() {
+    return (label_ref.empty()) ? default_name : label_ref;
+  }
+  std::string const& label_ref;
+  std::string default_name;
+};
+
+} // namespace Impl
+#endif /* defined KOKKOS_ENABLE_PROFILING */
+
 } // namespace Kokkos
 
 #endif /* #define KOKKOS_EXECPOLICY_HPP */
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index e224cd4e84..9c9af0dd8b 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -126,14 +126,6 @@ public:
   //! This memory space preferred device_type
   typedef Kokkos::Device< execution_space, memory_space > device_type;
 
-  /*--------------------------------*/
-  /* Functions unique to the HBWSpace */
-  static int in_parallel();
-
-  static void register_in_parallel( int (*)() );
-
-  /*--------------------------------*/
-
   /**\brief Default memory space instance */
   HBWSpace();
   HBWSpace( const HBWSpace & rhs ) = default;
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
index d00cce8f60..431635047a 100644
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -130,14 +130,6 @@ public:
   //! This memory space preferred device_type
   typedef Kokkos::Device< execution_space, memory_space > device_type;
 
-  /*--------------------------------*/
-  /* Functions unique to the HostSpace */
-  static int in_parallel();
-
-  static void register_in_parallel( int (*)() );
-
-  /*--------------------------------*/
-
   /**\brief Default memory space instance */
   HostSpace();
   HostSpace( HostSpace && rhs ) = default;
@@ -161,7 +153,7 @@ public:
            , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return m_name; }
 
 private:
 
   AllocationMechanism  m_alloc_mech;
diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp
index f300a6d9f6..87c705153e 100644
--- a/lib/kokkos/core/src/Kokkos_Layout.hpp
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@@ -156,6 +156,8 @@ struct LayoutStride {
       for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) {
         tmp.dimension[r] = 0 ;
         tmp.stride[r]    = 0 ;
+      }
+      for ( int r = 0 ; r < rank ; ++r ) {
         check_input &= ~int( 1 << order[r] );
       }
       if ( 0 == check_input ) {
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 1439dbd3f8..250ef6630a 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -297,6 +297,10 @@
   #endif
 #endif
 
+  #if defined( KOKKOS_ARCH_AVX512MIC )
+    #define KOKKOS_ENABLE_RFO_PREFETCH 1
+  #endif
+
   #if defined( __MIC__ )
     // Compiling for Xeon Phi
   #endif
@@ -344,13 +348,18 @@
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
+  #if defined( KOKKOS_ARCH_AVX512MIC )
+    #define KOKKOS_ENABLE_RFO_PREFETCH 1
+  #endif
+
   #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
     #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
   #endif
 
   #if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
       ( defined( __amd64 ) || defined( __amd64__ ) || \
-        defined( __x86_64 ) || defined( __x86_64__ ) )
+        defined( __x86_64 ) || defined( __x86_64__ ) || \
+        defined(__PPC64__) )
     #define KOKKOS_ENABLE_ASM 1
   #endif
 #endif
diff --git a/lib/kokkos/core/src/Kokkos_MasterLock.hpp b/lib/kokkos/core/src/Kokkos_MasterLock.hpp
new file mode 100644
index 0000000000..81564b8eac
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_MasterLock.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MASTER_LOCK_HPP
+#define KOKKOS_MASTER_LOCK_HPP
+
+#include 
+
+namespace Kokkos { namespace Experimental {
+
+// my be used to coordinate work between master instances
+// SHOULD NOT be used within a parallel algorithm
+//
+// This lock should be used with with a scoped lock guard
+// i.e. std::unique_lock, std::lock_guard
+//
+// cannot be copied or moved
+// has the following functions available
+//
+// Lock()
+// ~Lock()
+//
+// void lock()
+// void unlock()
+// bool try_lock()
+//
+template <typename ExecutionSpace>
+class MasterLock;
+
+}} // namespace Kokkos::Experimental
+
+#endif //KOKKOS_MASTER_LOCK_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index dbf1ad8057..1da936067d 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -66,11 +66,6 @@ private:
   enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
   enum : uint32_t { max_bit_count     = CB::max_bit_count };
 
-  /*  Defaults for min block, max block, and superblock sizes */
-  enum : uint32_t { MIN_BLOCK_SIZE_LG2  = 6  /*   64 bytes */ };
-  enum : uint32_t { MAX_BLOCK_SIZE_LG2  = 12 /*   4k bytes */ };
-  enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 /*  64k bytes */ };
-
   enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
 
   /*  Each superblock has a concurrent bitset state
@@ -85,6 +80,14 @@ private:
    *  is concurrently updated.
    */
 
+  /*  Mapping between block_size <-> block_state
+   *
+   *  block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
+   *  block_size  = m_sb_size_lg2 - ( block_state >> state_shift )
+   *
+   *  Thus A_block_size < B_block_size  <=>  A_block_state > B_block_state
+   */
+
   typedef typename DeviceType::memory_space base_memory_space ;
 
   enum { accessible =
@@ -251,10 +254,10 @@ public:
    *  significant runtime performance improvements.
    */
   MemoryPool( const base_memory_space & memspace
-            , const size_t   min_total_alloc_size
-            , const uint32_t min_block_alloc_size // = 1 << MIN_BLOCK_SIZE_LG2
-            , const uint32_t max_block_alloc_size // = 1 << MAX_BLOCK_SIZE_LG2
-            , const uint32_t min_superblock_size  // = 1 << SUPERBLOCK_SIZE_LG2
+            , const size_t min_total_alloc_size
+            , size_t min_block_alloc_size = 0
+            , size_t max_block_alloc_size = 0
+            , size_t min_superblock_size  = 0
             )
     : m_tracker()
     , m_sb_state_array(0)
@@ -267,8 +270,43 @@ public:
     , m_data_offset(0)
     , m_unused_padding(0)
     {
-      const uint32_t int_align_lg2  = 3 ; /* align as int[8] */
-      const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
+      const uint32_t int_align_lg2  = 3 ; /* align as int[8] */
+      const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
+
+      // Constraints and defaults:
+      //   min_block_alloc_size <= max_block_alloc_size
+      //   max_block_alloc_size <= min_superblock_size
+      //   min_superblock_size  <= min_total_alloc_size
+
+      const uint32_t MIN_BLOCK_SIZE = 1u << 6  /*   64 bytes */ ;
+      const uint32_t MAX_BLOCK_SIZE = 1u << 12 /*   4k bytes */ ;
+
+      if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
+
+      if ( 0 == max_block_alloc_size ) {
+
+        max_block_alloc_size = MAX_BLOCK_SIZE ;
+
+        // Upper bound of total allocation size
+        max_block_alloc_size = std::min( size_t(max_block_alloc_size)
+                                       , min_total_alloc_size );
+
+        // Lower bound of minimum block size
+        max_block_alloc_size = std::max( max_block_alloc_size
+                                       , min_block_alloc_size );
+      }
+
+      if ( 0 == min_superblock_size ) {
+        min_superblock_size = max_block_alloc_size ;
+
+        // Upper bound of total allocation size
+        min_superblock_size = std::min( size_t(min_superblock_size)
+                                      , min_total_alloc_size );
+
+        // Lower bound of maximum block size
+        min_superblock_size = std::max( min_superblock_size
+                                      , max_block_alloc_size );
+      }
 
       // Block and superblock size is power of two:
@@ -435,6 +473,8 @@ public:
   void * allocate( size_t alloc_size
                  , int32_t attempt_limit = 1 ) const noexcept
     {
+      if ( 0 == alloc_size ) return (void*) 0 ;
+
       void * p = 0 ;
 
       const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
@@ -444,10 +484,9 @@ public:
       // Allocation will fit within a superblock
       // that has block sizes ( 1 << block_size_lg2 )
 
-      const uint32_t block_count_lg2  = m_sb_size_lg2 - block_size_lg2 ;
-      const uint32_t block_state      = block_count_lg2 << state_shift ;
-      const uint32_t block_count      = 1u << block_count_lg2 ;
-      const uint32_t block_count_mask = block_count - 1 ;
+      const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
+      const uint32_t block_state     = block_count_lg2 << state_shift ;
+      const uint32_t block_count     = 1u << block_count_lg2 ;
 
       // Superblock hints for this block size:
       //   hint_sb_id_ptr[0] is the dynamically changing hint
@@ -465,7 +504,7 @@ public:
       // the guess for which block within a superblock should
       // be claimed.  If not available then a search occurs.
 
-      const uint32_t block_id_hint = block_count_mask &
+      const uint32_t block_id_hint =
         (uint32_t)( Kokkos::Impl::clock_tic()
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
         // Spread out potentially concurrent access
@@ -474,6 +513,9 @@ public:
 #endif
           );
 
+      // expected state of superblock for allocation
+      uint32_t sb_state = block_state ;
+
       int32_t sb_id = -1 ;
 
       volatile uint32_t * sb_state_array = 0 ;
@@ -484,6 +526,8 @@ public:
 
         if ( sb_id < 0 ) {
 
+          // No superblock specified, try the hint for this block size
+
           sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
 
           sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
@@ -493,16 +537,20 @@ public:
         //   0 <= sb_id
         //   sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
 
-        if ( block_state == ( state_header_mask & *sb_state_array ) ) {
+        if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
 
-          // This superblock state is assigned to this block size.
-          // Try to claim a bit.
+          // This superblock state is as expected, for the moment.
+          // Attempt to claim a bit.  The attempt updates the state
+          // so have already made sure the state header is as expected.
+
+          const uint32_t count_lg2 = sb_state >> state_shift ;
+          const uint32_t mask      = ( 1u << count_lg2 ) - 1 ;
 
           const Kokkos::pair<int,int> result =
             CB::acquire_bounded_lg2( sb_state_array
-                                   , block_count_lg2
-                                   , block_id_hint
-                                   , block_state
+                                   , count_lg2
+                                   , block_id_hint & mask
+                                   , sb_state
                                    );
 
           // If result.first < 0 then failed to acquire
@@ -512,16 +560,18 @@ public:
 
           if ( 0 <= result.first ) { // acquired a bit
 
+            const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
+
             // Set the allocated block pointer
 
             p = ((char*)( m_sb_state_array + m_data_offset ))
               + ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
-              + ( result.first << block_size_lg2 );  // block memory
+              + ( result.first << size_lg2 );        // block memory
 
             break ; // Success
           }
 
-// printf("  acquire block_count_lg2(%d) block_state(0x%x) sb_id(%d) result(%d,%d)\n" , block_count_lg2 , block_state , sb_id , result.first , result.second );
+// printf("  acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
 
         }
 
         //------------------------------------------------------------------
@@ -529,12 +579,18 @@ public:
         //  Must find a new superblock.
 
         //  Start searching at designated index for this block size.
-        //  Look for a partially full superblock of this block size.
-        //  Look for an empty superblock just in case cannot find partfull.
+        //  Look for superblock that, in preferential order,
+        //  1) part-full superblock of this block size
+        //  2) empty superblock to claim for this block size
+        //  3) part-full superblock of the next larger block size
 
+        sb_state = block_state ;  // Expect to find the desired state
         sb_id = -1 ;
+
+        bool update_hint = false ;
         int32_t sb_id_empty = -1 ;
+        int32_t sb_id_large = -1 ;
+        uint32_t sb_state_large = 0 ;
 
         sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
@@ -544,38 +600,54 @@ public:
           //  Note that the state may change at any moment
           //  as concurrent allocations and deallocations occur.
 
-          const uint32_t state = *sb_state_array ;
-          const uint32_t used  = state & state_used_mask ;
+          const uint32_t full_state = *sb_state_array ;
+          const uint32_t used       = full_state & state_used_mask ;
+          const uint32_t state      = full_state & state_header_mask ;
 
-          if ( block_state == ( state & state_header_mask ) ) {
+          if ( state == block_state ) {
 
             //  Superblock is assigned to this block size
 
-            if ( used < block_count ) {
+            if ( used < block_count ) { // There is room to allocate one block
 
               sb_id = id ;
 
-              if ( used + 1 < block_count ) {
+              //  Is there room to allocate more than one block?
 
-                //  There is room to allocate more than one block
-
-                Kokkos::atomic_compare_exchange
-                  ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
-              }
+              update_hint = used + 1 < block_count ;
 
               break ;
             }
           }
-          else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) {
+          else if ( 0 == used ) {
 
-            //  Superblock is not assigned to this block size
-            //  and is the first empty superblock encountered.
-            //  Save this id to use if a partfull superblock is not found.
+            //  Superblock is empty
 
-            sb_id_empty = id ;
+            if ( -1 == sb_id_empty ) {
+
+              //  Superblock is not assigned to this block size
+              //  and is the first empty superblock encountered.
+              //  Save this id to use if a partfull superblock is not found.
+
+              sb_id_empty = id ;
+            }
           }
+          else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
+                    ( -1 == sb_id_large /* have not found a larger */ ) &&
+                    ( state < block_state /* a larger block */ ) &&
+                    // is not full:
+                    ( used < ( 1u << ( state >> state_shift ) ) ) ) {
+            //  First superblock encountered that is
+            //  larger than this block size and
+            //  has room for an allocation.
+            //  Save this id to use of partfull or empty superblock not found
+            sb_id_large    = id ;
+            sb_state_large = state ;
+          }
+
+          // Iterate around the superblock array:
 
           if ( ++id < m_sb_count ) { sb_state_array += m_sb_state_size ; }
@@ -586,7 +658,7 @@ public:
           }
         }
 
-// printf("  search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)\n" , m_sb_count , sb_id , sb_id_empty );
+// printf("  search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
 
         if ( sb_id < 0 ) {
@@ -609,21 +681,31 @@ public:
 
             const uint32_t state_empty = state_header_mask & *sb_state_array ;
 
-            if ( state_empty ==
-                 Kokkos::atomic_compare_exchange
-                   (sb_state_array,state_empty,block_state) ) {
+            // If this thread claims the empty block then update the hint
+            update_hint =
+              state_empty ==
+              Kokkos::atomic_compare_exchange
+                (sb_state_array,state_empty,block_state);
+          }
+          else if ( 0 <= sb_id_large ) {
 
-              // If this thread claimed the block then update the hint
+            // Found a larger superblock with space available
 
-              Kokkos::atomic_compare_exchange
-                ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
-            }
+            sb_id    = sb_id_large ;
+            sb_state = sb_state_large ;
+
+            sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
           }
           else {
             // Did not find a potentially usable superblock
             --attempt_limit ;
           }
         }
+
+        if ( update_hint ) {
+          Kokkos::atomic_compare_exchange
+            ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
+        }
       } // end allocation attempt loop
 
       //--------------------------------------------------------------------
@@ -646,6 +728,8 @@ public:
   KOKKOS_INLINE_FUNCTION
   void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
     {
+      if ( 0 == p ) return ;
+
       // Determine which superblock and block
       const ptrdiff_t d =
         ((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
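
A hedged sketch of the relaxed MemoryPool constructor (not from the patch; a host-space pool is assumed and sizes are illustrative). Only the total size is mandatory now; block and superblock sizes default per the constraints spelled out in the constructor:

    #include <Kokkos_Core.hpp>

    void pool_example() {
      using pool_type = Kokkos::MemoryPool< Kokkos::DefaultHostExecutionSpace >;

      // min/max block and superblock sizes left at 0 -> derived defaults
      pool_type pool( Kokkos::HostSpace(), 1u << 20 /* 1 MiB total */ );

      void * p = pool.allocate( 256 );
      // ... use p ...
      pool.deallocate( p, 256 );

      // zero-sized requests now short-circuit to a null pointer
      void * q = pool.allocate( 0 );  // q == nullptr
      (void) q;
    }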
diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
index 94b58b8aff..af9c8ea782 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -72,11 +72,11 @@ struct MemoryTraits {
   //! Tag this class as a kokkos memory traits:
   typedef MemoryTraits memory_traits ;
 
-  enum { Unmanaged    = T & unsigned(Kokkos::Unmanaged) };
-  enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
-  enum { Atomic       = T & unsigned(Kokkos::Atomic) };
-  enum { Restrict     = T & unsigned(Kokkos::Restrict) };
-  enum { Aligned      = T & unsigned(Kokkos::Aligned) };
+  enum : bool { Unmanaged    = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) };
+  enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) };
+  enum : bool { Atomic       = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
+  enum : bool { Restrict     = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) };
+  enum : bool { Aligned      = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
 
 };
 
@@ -109,7 +109,11 @@ enum { MEMORY_ALIGNMENT =
 #else
     ( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
 #endif
-    , MEMORY_ALIGNMENT_THRESHOLD = 4
+#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
+    , MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
+#else
+    , MEMORY_ALIGNMENT_THRESHOLD = 4
+#endif
   };
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
index 3e11621ce6..d5de01cf2f 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -47,10 +47,6 @@
 #include 
 
 #if defined( KOKKOS_ENABLE_OPENMP)
 
-#if !defined(_OPENMP)
-#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
-#endif
-
 #include 
 
 #include 
@@ -67,95 +63,144 @@
 #include 
 #include 
 
+#include 
+
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
 
+namespace Impl {
+class OpenMPExec;
+}
+
 /// \class OpenMP
 /// \brief Kokkos device for multicore processors in the host memory space.
 class OpenMP {
 public:
-  //------------------------------------
-  //! \name Type declarations that all Kokkos devices must provide.
-  //@{
-
   //! Tag this class as a kokkos execution space
   using execution_space = OpenMP;
+
+  using memory_space =
 #ifdef KOKKOS_ENABLE_HBWSPACE
-  using memory_space = Experimental::HBWSpace;
+    Experimental::HBWSpace;
 #else
-  using memory_space = HostSpace;
+    HostSpace;
 #endif
+
   //! This execution space preferred device_type
-  using device_type = Kokkos::Device;
-
-  using array_layout = LayoutRight;
-  using size_type = memory_space::size_type;
-
+  using device_type          = Kokkos::Device< execution_space, memory_space >;
+  using array_layout         = LayoutRight;
+  using size_type            = memory_space::size_type;
   using scratch_memory_space = ScratchMemorySpace< OpenMP >;
 
-  //@}
-  //------------------------------------
-  //! \name Functions that all Kokkos execution spaces must implement.
-  //@{
+  /// \brief Get a handle to the default execution space instance
+  inline
+  OpenMP() noexcept;
 
-  inline static bool in_parallel();
+  // Using omp_get_max_threads(); is problematic
+  // On Intel (essentially an initial call to the OpenMP runtime
+  // without a parallel region before will set a process mask for a single core
+  // The runtime will than bind threads for a parallel region to other cores on the
+  // entering the first parallel region and make the process mask the aggregate of
+  // the thread masks. The intend seems to be to make serial code run fast, if you
+  // compile with OpenMP enabled but don't actually use parallel regions or so
+  // static int omp_max_threads = omp_get_max_threads();
+  static int get_current_max_threads() noexcept;
 
-  /** \brief  Set the device in a "sleep" state. A noop for OpenMP. */
-  static bool sleep();
+  /// \brief Initialize the default execution space
+  ///
+  /// if ( thread_count == -1 )
+  ///   then use the number of threads that openmp defaults to
+  /// if ( thread_count == 0 && Kokkos::hwlow_available() )
+  ///   then use hwloc to choose the number of threads and change
+  ///   the default number of threads
+  /// if ( thread_count > 0 )
+  ///   then force openmp to use the given number of threads and change
+  ///   the default number of threads
+  static void initialize( int thread_count = -1 );
 
-  /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
-  static bool wake();
-
-  /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
-  static void fence() {}
-
-  /// \brief Print configuration information to the given output stream.
-  static void print_configuration( std::ostream & , const bool detail = false );
-
-  /// \brief Free any resources being consumed by the device.
+  /// \brief Free any resources being consumed by the default execution space
   static void finalize();
 
-  /** \brief  Initialize the device.
-   *
-   *  1) If the hardware locality library is enabled and OpenMP has not
-   *     already bound threads then bind OpenMP threads to maximize
-   *     core utilization and group for memory hierarchy locality.
-   *
-   *  2) Allocate a HostThread for each OpenMP thread to hold its
-   *     topology and fan in/out data.
-   */
-  static void initialize( unsigned thread_count = 0 ,
-                          unsigned use_numa_count = 0 ,
-                          unsigned use_cores_per_numa = 0 );
+  /// \brief is the default execution space initialized for current 'master' thread
+  static bool is_initialized() noexcept;
 
-  static int is_initialized();
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool verbose = false );
 
-  /** \brief  Return the maximum amount of concurrency.  */
-  static int concurrency();
+  /// \brief is the instance running a parallel algorithm
+  inline
+  static bool in_parallel( OpenMP const& = OpenMP() ) noexcept;
 
-  //@}
-  //------------------------------------
-  /** \brief  This execution space has a topological thread pool which can be queried.
-   *
-   *  All threads within a pool have a common memory space for which they are cache coherent.
-   *    depth = 0  gives the number of threads in the whole pool.
-   *    depth = 1  gives the number of threads in a NUMA region, typically sharing L3 cache.
-   *    depth = 2  gives the number of threads at the finest granularity, typically sharing L1 cache.
-   */
-  inline static int thread_pool_size( int depth = 0 );
+  /// \brief Wait until all dispatched functors complete on the given instance
+  ///
+  ///  This is a no-op on OpenMP
+  inline
+  static void fence( OpenMP const& = OpenMP() ) noexcept;
+
+  /// \brief Does the given instance return immediately after launching
+  /// a parallel algorithm
+  ///
+  /// This always returns false on OpenMP
+  inline
+  static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept;
+
+
+  /// \brief Partition the default instance into new instances without creating
+  ///  new masters
+  ///
+  /// This is a no-op on OpenMP since the default instance cannot be partitioned
+  /// without promoting other threads to 'master'
+  static std::vector<OpenMP> partition(...);
+
+  /// Non-default instances should be ref-counted so that when the last
+  /// is destroyed the instance resources are released
+  ///
+  /// This is a no-op on OpenMP since a non default instance cannot be created
+  static OpenMP create_instance(...);
+
+  /// \brief Partition the default instance and call 'f' on each new 'master' thread
+  ///
+  /// Func is a functor with the following signiture
+  ///   void( int partition_id, int num_partitions )
+  template <typename F>
+  static void partition_master( F const& f
+                              , int requested_num_partitions = 0
+                              , int requested_partition_size = 0
+                              );
+
+  inline
+  static int thread_pool_size() noexcept;
 
   /** \brief  The rank of the executing thread in this thread pool */
-  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+  KOKKOS_INLINE_FUNCTION
+  static int thread_pool_rank() noexcept;
 
-  //------------------------------------
+#if !defined( KOKKOS_DISABLE_DEPRECATED )
+  /// \brief Initialize the default execution space
+  static void initialize( int thread_count,
+                          int use_numa_count,
+                          int use_cores_per_numa = 0);
 
-  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+  inline
+  static int thread_pool_size( int depth );
 
+  static void sleep() {};
+  static void wake() {};
 
-  KOKKOS_INLINE_FUNCTION static
-  unsigned hardware_thread_id() { return thread_pool_rank(); }
+  // use UniqueToken
+  static int concurrency();
 
-  static const char* name();
+  // use UniqueToken
+  inline
+  static int max_hardware_threads() noexcept;
+
+  // use UniqueToken
+  KOKKOS_INLINE_FUNCTION
+  static int hardware_thread_id() noexcept;
+#endif
+
+  static constexpr const char* name() noexcept { return "OpenMP"; }
 };
 
 } // namespace Kokkos
@@ -195,6 +240,7 @@ struct VerifyExecutionCanAccessMemorySpace
 /*--------------------------------------------------------------------------*/
 
 #include 
+#include 
 #include 
 #include 
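
A hedged sketch of the new partition_master hook; the functor signature follows the doc comment above, and the partition count is illustrative:

    #include <Kokkos_Core.hpp>
    #include <cstdio>

    void partition_example() {
      // split the default OpenMP instance into two partitions and run the
      // functor on each new 'master' thread
      Kokkos::OpenMP::partition_master(
        []( int partition_id, int num_partitions ) {
          std::printf( "partition %d of %d\n", partition_id, num_partitions );
        },
        2 /* requested_num_partitions */ );
    }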
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
index e412e608b2..fc8d6bec81 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -177,22 +177,23 @@ void parallel_for( const ExecPolicy & policy
 {
 #if defined(KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str);
+    Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelFor(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelFor(kpID);
+  }
 #endif
 }
@@ -210,14 +211,15 @@ void parallel_for( const size_t        work_count
 #if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
+    Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
@@ -420,21 +422,22 @@ void parallel_scan( const ExecutionPolicy & policy
 {
 #if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str);
+    Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelScan(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelScan(kpID);
+  }
 #endif
 }
@@ -453,21 +456,22 @@ void parallel_scan( const size_t        work_count
 #if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
+    Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
   Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelScan(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelScan(kpID);
+  }
 #endif
 }
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index 8ea5183e35..9df6d4ba09 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -872,13 +872,14 @@ namespace Impl {
                       const FunctorType& functor,
                       ReturnType& return_value) {
 #if defined(KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
-    }
+  uint64_t kpID = 0;
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label);
+    Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID);
+  }
 #endif
 
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Kokkos::Impl::shared_allocation_tracking_disable();
 #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
   Impl::ParallelReduce closure(functor_adaptor::functor(functor),
@@ -890,13 +891,13 @@ namespace Impl {
                                policy,
                                return_value_adapter::return_value(return_value,functor));
 #endif
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+  Kokkos::Impl::shared_allocation_tracking_enable();
 
   closure.execute();
 
 #if defined(KOKKOS_ENABLE_PROFILING)
-    if(Kokkos::Profiling::profileLibraryLoaded()) {
-      Kokkos::Profiling::endParallelReduce(kpID);
-    }
+  if(Kokkos::Profiling::profileLibraryLoaded()) {
+    Kokkos::Profiling::endParallelReduce(kpID);
+  }
 #endif
 }
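
The profiling hooks above now derive a default construct name from the functor type and work tag when no label is given. A hedged sketch (Kernel and MyTag are hypothetical names for illustration):

    #include <Kokkos_Core.hpp>

    struct MyTag {};

    struct Kernel {
      // tagged operator: an unlabeled launch falls back to an
      // implementation-defined "Functor/Tag" spelling in tool output
      KOKKOS_INLINE_FUNCTION void operator()( MyTag , const int ) const {}
    };

    void label_example() {
      // unlabeled: profiling tools see typeid-derived "Kernel/MyTag"
      Kokkos::parallel_for( Kokkos::RangePolicy<MyTag>(0, 10), Kernel() );
      // labeled: the explicit string wins
      Kokkos::parallel_for( "my_kernel", Kokkos::RangePolicy<MyTag>(0, 10), Kernel() );
    }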
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index 73e8ae3030..539761a1f9 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -66,6 +66,7 @@
 #include 
 
+#include 
 
 namespace Kokkos {
@@ -526,6 +527,7 @@ public:
     }
 };
 
+
 /*--------------------------------------------------------------------------*/
 
 template< class FunctorType , class ... Traits >
@@ -604,6 +606,178 @@ public:
     {}
 };
 
+} // namespace Impl
+} // namespace Kokkos
+
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::Experimental::MDRangePolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+{
+private:
+
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;
+
+  void
+  exec() const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        iterate_type( m_mdr_policy, m_functor )( i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    { this->exec(); }
+
+  inline
+  ParallelFor( const FunctorType   & arg_functor
+             , const MDRangePolicy & arg_policy )
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    {}
+};
+
+
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::Experimental::MDRangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Serial
+                    >
+{
+private:
+
+  typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
+  typedef typename MDRangePolicy::impl_range_policy Policy ;
+
+  typedef typename MDRangePolicy::work_tag WorkTag ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef typename ReducerTypeFwd::value_type ValueType;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
+
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
+
+
+  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
+                                                                           , FunctorType
+                                                                           , WorkTag
+                                                                           , ValueType
+                                                                           >;
+
+
+  const FunctorType   m_functor ;
+  const MDRangePolicy m_mdr_policy ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  inline
+  void
+  exec( reference_type update ) const
+    {
+      const typename Policy::member_type e = m_policy.end();
+      for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        iterate_type( m_mdr_policy, m_functor, update )( i );
+      }
+    }
+
+public:
+
+  inline
+  void execute() const
+    {
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> exec( update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
+        final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const MDRangePolicy & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Serial reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Serial reduce result must be a View in HostSpace" );
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , MDRangePolicy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_mdr_policy( arg_policy )
+    , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
+    , m_reducer( reducer )
+    , m_result_ptr( reducer.view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+};
+
+
+
 } // namespace Impl
 } // namespace Kokkos
@@ -819,6 +993,60 @@ public:
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
+namespace Kokkos { namespace Experimental {
+
+template<>
+class UniqueToken< Serial, UniqueTokenScope::Instance>
+{
+public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const noexcept { return 0; }
+
+  /// \brief release a value acquired by generate
+  inline
+  void release( int ) const noexcept {}
+};
+
+template<>
+class UniqueToken< Serial, UniqueTokenScope::Global>
+{
+public:
+  using execution_space = Serial;
+  using size_type       = int;
+
+  /// \brief create object size for concurrency on the given instance
+  ///
+  /// This object should not be shared between instances
+  UniqueToken( execution_space const& = execution_space() ) noexcept {}
+
+  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
+  inline
+  int size() const noexcept { return 1; }
+
+  /// \brief acquire value such that 0 <= value < size()
+  inline
+  int acquire() const noexcept { return 0; }
+
+  /// \brief release a value acquired by generate
+  inline
+  void release( int ) const noexcept {}
+};
+
+}} // namespace Kokkos::Experimental
+
 #include 
 
 #endif // defined( KOKKOS_ENABLE_SERIAL )
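
A minimal sketch of the UniqueToken protocol using the Serial specializations above (the acquire/release pairing is the point; Serial's pool holds exactly one id):

    #include <Kokkos_Core.hpp>

    void token_example() {
      // instance-scope token (the default scope)
      Kokkos::Experimental::UniqueToken< Kokkos::Serial > token;

      const int id = token.acquire();   // 0 <= id < token.size() == 1
      // ... index per-"thread" scratch with 'id' ...
      token.release( id );
    }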
static_cast(arg_policy.m_dependence.m_task->m_queue) : (queue_type*) 0 ); if ( 0 == queue ) { @@ -530,8 +524,12 @@ public: future_type f ; // Allocate task from memory pool + + const size_t alloc_size = + queue->template spawn_allocation_size< FunctorType >(); + f.m_task = - reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type))); + reinterpret_cast< task_type * >(queue->allocate(alloc_size) ); if ( f.m_task ) { @@ -539,15 +537,17 @@ public: // Reference count starts at two: // +1 for the matching decrement when task is complete // +1 for the future - new ( f.m_task ) - task_type( arg_function - , queue - , arg_policy.m_dependence.m_task /* dependence */ - , 2 /* reference count */ - , int(sizeof(task_type)) /* allocation size */ - , int(arg_policy.m_task_type) - , int(arg_policy.m_priority) - , std::move(arg_functor) ); + new ( f.m_task ) task_type( std::move(arg_functor) ); + + f.m_task->m_apply = arg_function ; + f.m_task->m_queue = queue ; + f.m_task->m_next = arg_policy.m_dependence.m_task ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = alloc_size ; + f.m_task->m_task_type = arg_policy.m_task_type ; + f.m_task->m_priority = arg_policy.m_priority ; + + Kokkos::memory_fence(); // The dependence (if any) is processed immediately // within the schedule function, as such the dependence's @@ -586,6 +586,30 @@ public: // Postcondition: task is in Executing-Respawn state } + template< typename FunctorType > + KOKKOS_FUNCTION static + void + respawn( FunctorType * arg_self + , TaskScheduler const & + , TaskPriority const & arg_priority + ) + { + // Precondition: task is in Executing state + + using value_type = typename FunctorType::value_type ; + using task_type = Impl::TaskBase< execution_space + , value_type + , FunctorType > ; + + task_type * const task = static_cast< task_type * >( arg_self ); + + task->m_priority = static_cast(arg_priority); + + task->add_dependence( (task_base*) 0 ); + + // Postcondition: task is in Executing-Respawn state + } + //---------------------------------------- /**\brief Return a future that is complete * when all input futures are complete. @@ -596,7 +620,7 @@ public: when_all( Future< A1 , A2 > const arg[] , int narg ) { using future_type = Future< execution_space > ; - using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base = Kokkos::Impl::TaskBase< void , void , void > ; future_type f ; @@ -610,9 +634,9 @@ public: // Increment reference count to track subsequent assignment. 
Kokkos::atomic_increment( &(t->m_ref_count) ); if ( queue == 0 ) { - queue = t->m_queue ; + queue = static_cast< queue_type * >( t->m_queue ); } - else if ( queue != t->m_queue ) { + else if ( queue != static_cast< queue_type * >( t->m_queue ) ) { Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" ); } } @@ -620,28 +644,34 @@ public: if ( queue != 0 ) { - size_t const size = sizeof(task_base) + narg * sizeof(task_base*); + size_t const alloc_size = queue->when_all_allocation_size( narg ); f.m_task = - reinterpret_cast< task_base * >( queue->allocate( size ) ); + reinterpret_cast< task_base * >( queue->allocate( alloc_size ) ); if ( f.m_task ) { // Reference count starts at two: // +1 to match decrement when task completes // +1 for the future - new( f.m_task ) task_base( queue - , 2 /* reference count */ - , size /* allocation size */ - , narg /* dependence count */ - ); + + new( f.m_task ) task_base(); + + f.m_task->m_queue = queue ; + f.m_task->m_ref_count = 2 ; + f.m_task->m_alloc_size = alloc_size ; + f.m_task->m_dep_count = narg ; + f.m_task->m_task_type = task_base::Aggregate ; // Assign dependences, reference counts were already incremented - task_base ** const dep = f.m_task->aggregate_dependences(); + task_base * volatile * const dep = + f.m_task->aggregate_dependences(); for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; } + Kokkos::memory_fence(); + queue->schedule_aggregate( f.m_task ); // this when_all may be processed at any moment } diff --git a/lib/kokkos/core/src/Kokkos_UniqueToken.hpp b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp new file mode 100644 index 0000000000..1ffb07a6db --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNIQUE_TOKEN_HPP +#define KOKKOS_UNIQUE_TOKEN_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { namespace Experimental { + +enum class UniqueTokenScope : int +{ + Instance, + Global +}; + +/// \brief class to generate unique ids based on the required amount of concurrency +/// +/// This object should behave like a ref-counted object, so that when the last +/// instance is destroyed, resources are freed if needed +template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance> +class UniqueToken +{ +public: + using execution_space = ExecutionSpace; + using size_type = typename execution_space::size_type; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ); + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type size() const ; + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + size_type acquire() const ; + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( size_type ) const ; +}; + +}} // namespace Kokkos::Experimental + +#endif //KOKKOS_UNIQUE_TOKEN_HPP diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 3312aa6a96..1754e4a8fb 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -54,11 +54,14 @@ #include <Kokkos_MemoryTraits.hpp> #include <Kokkos_ExecPolicy.hpp> +#if defined(KOKKOS_ENABLE_PROFILING) +#include <impl/Kokkos_Profiling_Interface.hpp> +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { template< class DataType > @@ -73,16 +76,6 @@ struct ViewDataAnalysis ; template< class , class ... 
> class ViewMapping { public: enum { is_assignable = false }; }; -} /* namespace Impl */ -} /* namespace Experimental */ -} /* namespace Kokkos */ - -namespace Kokkos { -namespace Impl { - -using Kokkos::Experimental::Impl::ViewMapping ; -using Kokkos::Experimental::Impl::ViewDataAnalysis ; - } /* namespace Impl */ } /* namespace Kokkos */ @@ -1563,12 +1556,12 @@ namespace Kokkos { namespace Impl { inline -void shared_allocation_tracking_claim_and_disable() -{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); } +void shared_allocation_tracking_disable() +{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); } inline -void shared_allocation_tracking_release_and_enable() -{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); } +void shared_allocation_tracking_enable() +{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); } } /* namespace Impl */ } /* namespace Kokkos */ @@ -1795,6 +1788,20 @@ void deep_copy if ( (void *) dst.data() != (void*) src.data() ) { +#if defined(KOKKOS_ENABLE_PROFILING) + if (Kokkos::Profiling::profileLibraryLoaded()) { + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); + Kokkos::Profiling::beginDeepCopy( + Kokkos::Profiling::SpaceHandle(dst_memory_space::name()), + dst.label(), + dst.data(), + Kokkos::Profiling::SpaceHandle(src_memory_space::name()), + src.label(), + src.data(), + nbytes); + } +#endif + // Concern: If overlapping views then a parallel copy will be erroneous. // ... @@ -1882,7 +1889,14 @@ void deep_copy else { Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation"); } - } + +#if defined(KOKKOS_ENABLE_PROFILING) + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endDeepCopy(); + } +#endif + + } // ( (void *) dst.data() != (void*) src.data() ) } } /* namespace Kokkos */ @@ -2249,6 +2263,82 @@ resize( Kokkos::View<T,P...> & v , static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" ); + // Fix #904 by checking dimensions before actually resizing. + // + // Rank is known at compile time, so hopefully the compiler will + // remove branches that are compile-time false. The upcoming "if + // constexpr" language feature would make this certain.
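// [Editor's note: illustrative sketch, not part of the patch; the view name and extents are hypothetical.] With the early returns below, a shape-preserving resize becomes a no-op while a changed extent still reallocates: // Kokkos::View<double*> a( "a", 100 ); // auto p = a.data(); // Kokkos::resize( a, 100 ); // extents unchanged: returns early, p stays valid // Kokkos::resize( a, 200 ); // extents differ: reallocates and remaps the old data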
+ if (view_type::Rank == 1 && + n0 == static_cast<size_t> (v.extent(0))) { + return; + } + if (view_type::Rank == 2 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1))) { + return; + } + if (view_type::Rank == 3 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2))) { + return; + } + if (view_type::Rank == 4 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3))) { + return; + } + if (view_type::Rank == 5 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4))) { + return; + } + if (view_type::Rank == 6 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5))) { + return; + } + if (view_type::Rank == 7 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5)) && + n6 == static_cast<size_t> (v.extent(6))) { + return; + } + if (view_type::Rank == 8 && + n0 == static_cast<size_t> (v.extent(0)) && + n1 == static_cast<size_t> (v.extent(1)) && + n2 == static_cast<size_t> (v.extent(2)) && + n3 == static_cast<size_t> (v.extent(3)) && + n4 == static_cast<size_t> (v.extent(4)) && + n5 == static_cast<size_t> (v.extent(5)) && + n6 == static_cast<size_t> (v.extent(6)) && + n7 == static_cast<size_t> (v.extent(7))) { + return; + } + // If Kokkos ever supports Views of rank > 8, the above code won't + // be incorrect, because avoiding reallocation in resize() is just + // an optimization. + + // TODO (mfh 27 Jun 2017) If the old View has enough space but just + // different dimensions (e.g., if the product of the dimensions, + // including extra space for alignment, will not change), then + // consider just reusing storage. For now, Kokkos always + // reallocates if any of the dimensions change, even if the old View + // has enough space. + view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 ); Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v ); @@ -2317,6 +2407,106 @@ void realloc( Kokkos::View<T,P...> & v , } } /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Impl { + +template < class Specialize, typename A, typename B > +struct CommonViewValueType; + +template < typename A, typename B > +struct CommonViewValueType< void, A, B > +{ + using value_type = typename std::common_type< A , B >::type; +}; + + +template < class Specialize, class ValueType > +struct CommonViewAllocProp; + +template < class ValueType > +struct CommonViewAllocProp< void, ValueType > +{ + using value_type = ValueType; + + template < class ... Views > + CommonViewAllocProp( const Views & ... ) {} +}; + + +template < class ... Views > +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. 
prop_type +template < class FirstView > +struct DeduceCommonViewAllocProp< FirstView > +{ + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view< FirstView >::value }; + + using prop_type = CommonViewAllocProp< specialize, value_type >; +}; + + +template < class FirstView, class ... NextViews > +struct DeduceCommonViewAllocProp< FirstView, NextViews... > +{ + using NextTraits = DeduceCommonViewAllocProp< NextViews... >; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view< FirstView >::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error out + static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value) , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" ); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value + , first_specialize + , typename std::conditional< ( std::is_same< first_specialize, void >::value + && !std::is_same< next_specialize, void >::value) + , next_specialize + , first_specialize + >::type + >::type; + + using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp< specialize, value_type >; +}; + +} // end namespace Impl + +template < class ... Views > +using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ; + +// User function +template < class ... Views > +DeducedCommonPropsType<Views...> +common_view_alloc_prop( Views const & ... views ) +{ + return DeducedCommonPropsType<Views...>( views... ); +} + +} // namespace Kokkos + + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // For backward compatibility: @@ -2350,6 +2540,9 @@ using Kokkos::Impl::WithoutInitializing_t ; using Kokkos::Impl::AllowPadding_t ; using Kokkos::Impl::SharedAllocationRecord ; using Kokkos::Impl::SharedAllocationTracker ; +using Kokkos::Impl::ViewMapping ; +using Kokkos::Impl::ViewDataAnalysis ; + } /* namespace Impl */ } /* namespace Experimental */ diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..58b0f72f51 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -0,0 +1,265 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_WORKGRAPHPOLICY_HPP +#define KOKKOS_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template< class functor_type , class execution_space, class ... policy_args > +class WorkGraphExec; + +}}} // namespace Kokkos::Impl::Experimental + +namespace Kokkos { +namespace Experimental { + +template< class ... 
Properties > +class WorkGraphPolicy +{ +public: + + using self_type = WorkGraphPolicy<Properties ...>; + using traits = Kokkos::Impl::PolicyTraits<Properties ...>; + using index_type = typename traits::index_type; + using execution_space = typename traits::execution_space; + using work_tag = typename traits::work_tag; + using memory_space = typename execution_space::memory_space; + using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>; + using member_type = index_type; + +private: + + graph_type m_graph; + + using ints_type = Kokkos::View<std::int32_t*, memory_space>; + using range_type = Kokkos::pair<std::int32_t, std::int32_t>; + using ranges_type = Kokkos::View<range_type*, memory_space>; + const std::int32_t m_total_work; + ints_type m_counts; + ints_type m_queue; + ranges_type m_ranges; + +public: + + struct TagZeroRanges {}; + KOKKOS_INLINE_FUNCTION + void operator()(TagZeroRanges, std::int32_t i) const { + m_ranges[i] = range_type(0, 0); + } + void zero_ranges() { + using policy_type = RangePolicy<std::int32_t, TagZeroRanges, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, 1)); + closure.execute(); + execution_space::fence(); + } + + struct TagFillQueue {}; + KOKKOS_INLINE_FUNCTION + void operator()(TagFillQueue, std::int32_t i) const { + if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i); + } + void fill_queue() { + using policy_type = RangePolicy<std::int32_t, TagFillQueue, execution_space>; + using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>; + const closure_type closure(*this, policy_type(0, m_total_work)); + closure.execute(); + execution_space::fence(); + } + +private: + + inline + void setup() { + if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) { + Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t"); + } + get_crs_transpose_counts(m_counts, m_graph); + m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work); + deep_copy(m_queue, std::int32_t(-1)); + m_ranges = ranges_type("ranges", 1); + fill_queue(); + } + + KOKKOS_INLINE_FUNCTION + std::int32_t pop_work() const { + range_type w(-1,-1); + while (true) { + const range_type w_new( w.first + 1 , w.second ); + w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); + if ( w.first < w.second ) { // there was work in the queue + if ( w_new.first == w.first + 1 && w_new.second == w.second ) { + // we got a work item + std::int32_t i; + // the push_work function may have incremented the end counter + // but not yet written the work index into the queue. + // wait until the entry is valid. 
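// [Editor's note: explanatory sketch, not part of the patch.] m_ranges(0) packs the queue state into a (begin, end) cursor pair: 'begin' is the next slot to pop, 'end' the next slot to push, and both only grow. pop_work attempts atomic_compare_exchange( &m_ranges(0), (begin,end), (begin+1,end) ), while push_work attempts the symmetric (begin,end) -> (begin,end+1) update and only then stores the work index into m_queue(end). A popper can therefore claim slot 'begin' before the pusher's store lands, observe the initial -1, and must spin until the entry becomes valid, which is exactly what the loop below does.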
+ while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) ); + return i; + } // we got a work item + } else { // there was no work in the queue +#ifdef KOKKOS_DEBUG + if ( w_new.first == w.first + 1 && w_new.second == w.second ) { + Kokkos::abort("bug in pop_work"); + } +#endif + if (w.first == m_total_work) { // all work is done + return -1; + } else { // need to wait for more work to be pushed + // take a guess that one work item will be pushed + // the key thing is we can't leave (w) alone, because + // otherwise the next compare_exchange may succeed in + // popping work from an empty queue + w.second++; + } + } // there was no work in the queue + } // while (true) + } + + KOKKOS_INLINE_FUNCTION + void push_work(std::int32_t i) const { + range_type w(-1,-1); + while (true) { + const range_type w_new( w.first , w.second + 1 ); + // try to increment the end counter + w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); + // stop trying if the increment was successful + if ( w.first == w_new.first && w.second + 1 == w_new.second ) break; + } + // write the work index into the claimed spot in the queue + *((volatile std::int32_t*)(&m_queue( w.second ))) = i; + // push this write out into the memory system + memory_fence(); + } + + template< class functor_type , class execution_space, class ... policy_args > + friend class Kokkos::Impl::Experimental::WorkGraphExec; + +public: + + WorkGraphPolicy(graph_type arg_graph) + : m_graph(arg_graph) + , m_total_work( arg_graph.numRows() ) + { + setup(); + } + +}; + +}} // namespace Kokkos::Experimental + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template< class functor_type , class execution_space, class ... policy_args > +class WorkGraphExec +{ + public: + + using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >; + using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... 
>; + using member_type = typename policy_type::member_type; + using memory_space = typename execution_space::memory_space; + + protected: + + const functor_type m_functor; + const policy_type m_policy; + + protected: + + KOKKOS_INLINE_FUNCTION + std::int32_t before_work() const { + return m_policy.pop_work(); + } + + KOKKOS_INLINE_FUNCTION + void after_work(std::int32_t i) const { + /* fence any writes that were done by the work item itself + (usually writing its result to global memory) */ + memory_fence(); + const std::int32_t begin = m_policy.m_graph.row_map( i ); + const std::int32_t end = m_policy.m_graph.row_map( i + 1 ); + for (std::int32_t j = begin; j < end; ++j) { + const std::int32_t next = m_policy.m_graph.entries( j ); + const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 ); + if ( old_count == 1 ) m_policy.push_work( next ); + } + } + + inline + WorkGraphExec( const functor_type & arg_functor + , const policy_type & arg_policy ) + : m_functor( arg_functor ) + , m_policy( arg_policy ) + { + } +}; + +}}} // namespace Kokkos::Impl::Experimental + +#ifdef KOKKOS_ENABLE_SERIAL +#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_CUDA +#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp" +#endif + +#ifdef KOKKOS_ENABLE_THREADS +#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp" +#endif + +#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp index 4e0ea93920..915fbe52c1 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp @@ -45,75 +45,100 @@ #if defined( KOKKOS_ENABLE_OPENMP ) #include +#include + #include #include #include + #include + #include -#include #include #include namespace Kokkos { namespace Impl { -namespace { -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); +int g_openmp_hardware_max_threads = 1; -int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 ); +__thread int t_openmp_hardware_id = 0; +__thread Impl::OpenMPExec * t_openmp_instance = nullptr; -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() +void OpenMPExec::validate_partition( const int nthreads + , int & num_partitions + , int & partition_size + ) { -#ifndef __CUDA_ARCH__ - return omp_in_parallel() && ! 
kokkos_omp_in_critical_region ; -#else - return 0; -#endif + if (nthreads == 1) { + num_partitions = 1; + partition_size = 1; + } + else if( num_partitions < 1 && partition_size < 1) { + int idle = nthreads; + for (int np = 2; np <= nthreads ; ++np) { + for (int ps = 1; ps <= nthreads/np; ++ps) { + if (nthreads - np*ps < idle) { + idle = nthreads - np*ps; + num_partitions = np; + partition_size = ps; + } + if (idle == 0) { + break; + } + } + } + } + else if( num_partitions < 1 && partition_size > 0 ) { + if ( partition_size <= nthreads ) { + num_partitions = nthreads / partition_size; + } + else { + num_partitions = 1; + partition_size = nthreads; + } + } + else if( num_partitions > 0 && partition_size < 1 ) { + if ( num_partitions <= nthreads ) { + partition_size = nthreads / num_partitions; + } + else { + num_partitions = nthreads; + partition_size = 1; + } + } + else if ( num_partitions * partition_size > nthreads ) { + int idle = nthreads; + const int NP = num_partitions; + const int PS = partition_size; + for (int np = NP; np > 0; --np) { + for (int ps = PS; ps > 0; --ps) { + if ( (np*ps <= nthreads) + && (nthreads - np*ps < idle) ) { + idle = nthreads - np*ps; + num_partitions = np; + partition_size = ps; + } + if (idle == 0) { + break; + } + } + } + } + } -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos - - -namespace Kokkos { -namespace Impl { - -int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 }; - -int OpenMPExec::m_pool_topo[ 4 ] = { 0 }; - -HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 }; - -void OpenMPExec::verify_is_process( const char * const label ) +void OpenMPExec::verify_is_master( const char * const label ) { - if ( omp_in_parallel() ) { + if ( !t_openmp_instance ) + { std::string msg( label ); - msg.append( " ERROR: in parallel" ); + msg.append( " ERROR: in parallel or not initialized" ); Kokkos::Impl::throw_runtime_exception( msg ); } } -void OpenMPExec::verify_initialized( const char * const label ) -{ - if ( 0 == m_pool[0] ) { - std::string msg( label ); - msg.append( " ERROR: not initialized" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } - - if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) { - std::string msg( label ); - msg.append( " ERROR: Initialized but threads modified inappropriately" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } - -} } // namespace Impl } // namespace Kokkos @@ -133,11 +158,11 @@ void OpenMPExec::clear_thread_data() const int old_alloc_bytes = m_pool[0] ? 
( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ; - Kokkos::HostSpace space ; + OpenMP::memory_space space ; -#pragma omp parallel + #pragma omp parallel num_threads( m_pool_size ) { - const int rank = m_map_rank[ omp_get_thread_num() ]; + const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { @@ -189,13 +214,13 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , team_shared_bytes , thread_local_bytes ); - const int pool_size = omp_get_max_threads(); + OpenMP::memory_space space ; - Kokkos::HostSpace space ; + memory_fence(); -#pragma omp parallel + #pragma omp parallel num_threads(m_pool_size) { - const int rank = m_map_rank[ omp_get_thread_num() ]; + const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { @@ -214,11 +239,14 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , pool_reduce_bytes , team_reduce_bytes , team_shared_bytes - , thread_local_bytes ); + , thread_local_bytes + ); + + memory_fence(); } /* END #pragma omp parallel */ - HostThreadTeamData::organize_pool( m_pool , pool_size ); + HostThreadTeamData::organize_pool( m_pool , m_pool_size ); } } @@ -232,16 +260,8 @@ namespace Kokkos { //---------------------------------------------------------------------------- -int OpenMP::is_initialized() -{ return 0 != Impl::OpenMPExec::m_pool[0]; } - -void OpenMP::initialize( unsigned thread_count , - unsigned use_numa_count , - unsigned use_cores_per_numa ) +int OpenMP::get_current_max_threads() noexcept { - // Before any other call to OMP query the maximum number of threads - // and save the value for re-initialization unit testing. - // Using omp_get_max_threads(); is problematic in conjunction with // Hwloc on Intel (essentially an initial call to the OpenMP runtime // without a parallel region before will set a process mask for a single core @@ -250,110 +270,99 @@ void OpenMP::initialize( unsigned thread_count , // the thread masks. The intend seems to be to make serial code run fast, if you // compile with OpenMP enabled but don't actually use parallel regions or so // static int omp_max_threads = omp_get_max_threads(); - int nthreads = 0; + + int count = 0; #pragma omp parallel { #pragma omp atomic - nthreads++; + ++count; } + return count; +} - static int omp_max_threads = nthreads; - - const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ; - - bool thread_spawn_failed = false ; - - if ( ! is_initialized ) { - - // Use hwloc thread pinning if concerned with locality. - // If spreading threads across multiple NUMA regions. - // If hyperthreading is enabled. - Impl::s_using_hwloc = hwloc::available() && ( - ( 1 < Kokkos::hwloc::get_available_numa_count() ) || - ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) ); - - std::pair threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ]; - - // If hwloc available then use it's maximum value. - - if ( thread_count == 0 ) { - thread_count = Impl::s_using_hwloc - ? 
Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core() - : omp_max_threads ; - } - - if(Impl::s_using_hwloc) - hwloc::thread_mapping( "Kokkos::OpenMP::initialize" , - false /* do not allow asynchronous */ , - thread_count , - use_numa_count , - use_cores_per_numa , - threads_coord ); - - // Spawn threads: - - omp_set_num_threads( thread_count ); - - // Verify OMP interaction: - if ( int(thread_count) != omp_get_max_threads() ) { - thread_spawn_failed = true ; - } - - // Verify spawning and bind threads: -#pragma omp parallel - { -#pragma omp critical - { - if ( int(thread_count) != omp_get_num_threads() ) { - thread_spawn_failed = true ; - } - - // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region. - // Call to 'new' may not be thread safe as well. - - const unsigned omp_rank = omp_get_thread_num(); - const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() - ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) - : omp_rank ; - - Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ; - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - - if ( ! thread_spawn_failed ) { - Impl::OpenMPExec::m_pool_topo[0] = thread_count ; - Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count; - Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1; - - // New, unified host thread team data: - { - size_t pool_reduce_bytes = 32 * thread_count ; - size_t team_reduce_bytes = 32 * thread_count ; - size_t team_shared_bytes = 1024 * thread_count ; - size_t thread_local_bytes = 1024 ; - - Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes - , team_reduce_bytes - , team_shared_bytes - , thread_local_bytes - ); - } - } - } - - if ( is_initialized || thread_spawn_failed ) { - std::string msg("Kokkos::OpenMP::initialize ERROR"); - - if ( is_initialized ) { msg.append(" : already initialized"); } - if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); } +void OpenMP::initialize( int thread_count ) +{ + if ( omp_in_parallel() ) { + std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel"); Kokkos::Impl::throw_runtime_exception(msg); } + if ( Impl::t_openmp_instance ) + { + finalize(); + } + + { + if (nullptr == std::getenv("OMP_PROC_BIND") ) { + printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n"); + printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n"); + printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n"); + printf(" For unit testing set OMP_PROC_BIND=false\n"); + } + + OpenMP::memory_space space ; + + // Before any other call to OMP query the maximum number of threads + // and save the value for re-initialization unit testing. 
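// [Editor's note: usage sketch, not part of the patch; the thread count is illustrative.] This path expects the binding hints warned about above to be exported before launch (e.g. OMP_PROC_BIND=spread and OMP_PLACES=threads) and is driven from host code such as: // Kokkos::OpenMP::initialize( 16 ); // request 16 threads // ... run parallel kernels ... // Kokkos::OpenMP::finalize(); // Per the branches below, a negative thread_count keeps the queried hardware maximum, zero resets it to the hwloc-derived process count, and a positive value overrides it via omp_set_num_threads().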
+ + Impl::g_openmp_hardware_max_threads = get_current_max_threads(); + + int process_num_threads = Impl::g_openmp_hardware_max_threads; + + if ( Kokkos::hwloc::available() ) { + process_num_threads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } + + // if thread_count < 0, use g_openmp_hardware_max_threads; + // if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads + // if thread_count > 0, set g_openmp_hardware_max_threads to thread_count + if (thread_count < 0 ) { + thread_count = Impl::g_openmp_hardware_max_threads; + } + else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) { + Impl::g_openmp_hardware_max_threads = process_num_threads; + omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + } + else { + if( thread_count > process_num_threads ) { + printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n"); + printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count ); + } + Impl::g_openmp_hardware_max_threads = thread_count; + omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + } + + // setup thread local + #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) + { + Impl::t_openmp_instance = nullptr; + Impl::t_openmp_hardware_id = omp_get_thread_num(); + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + } + + void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) ); + + Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads ); + + // New, unified host thread team data: + { + size_t pool_reduce_bytes = 32 * thread_count ; + size_t team_reduce_bytes = 32 * thread_count ; + size_t team_shared_bytes = 1024 * thread_count ; + size_t thread_local_bytes = 1024 ; + + Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes + , team_reduce_bytes + , team_shared_bytes + , thread_local_bytes + ); + } + } + + // Check for over-subscription //if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) { // std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; @@ -373,20 +382,38 @@ void OpenMP::initialize( unsigned thread_count , void OpenMP::finalize() { - Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" ); - Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" ); + if ( omp_in_parallel() ) + { + std::string msg("Kokkos::OpenMP::finalize ERROR "); + if( !Impl::t_openmp_instance ) msg.append(": not initialized"); + if( omp_in_parallel() ) msg.append(": in parallel"); + Kokkos::Impl::throw_runtime_exception(msg); + } - // New, unified host thread team data: - Impl::OpenMPExec::clear_thread_data(); + if ( Impl::t_openmp_instance ) { - Impl::OpenMPExec::m_pool_topo[0] = 0 ; - Impl::OpenMPExec::m_pool_topo[1] = 0 ; - Impl::OpenMPExec::m_pool_topo[2] = 0 ; + const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads + ? 
Impl::g_openmp_hardware_max_threads + : Impl::t_openmp_instance->m_pool_size; - omp_set_num_threads(1); + using Exec = Impl::OpenMPExec; + Exec * instance = Impl::t_openmp_instance; + instance->~Exec(); - if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) { - hwloc::unbind_this_thread(); + OpenMP::memory_space space; + space.deallocate( instance, sizeof(Exec) ); + + #pragma omp parallel num_threads(nthreads) + { + Impl::t_openmp_hardware_id = 0; + Impl::t_openmp_instance = nullptr; + Impl::SharedAllocationRecord< void, void >::tracking_disable(); + } + + // allow main thread to track + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + + Impl::g_openmp_hardware_max_threads = 1; } #if defined(KOKKOS_ENABLE_PROFILING) @@ -396,70 +423,48 @@ void OpenMP::finalize() //---------------------------------------------------------------------------- -void OpenMP::print_configuration( std::ostream & s , const bool detail ) +void OpenMP::print_configuration( std::ostream & s , const bool verbose ) { - Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" ); - s << "Kokkos::OpenMP" ; -#if defined( KOKKOS_ENABLE_OPENMP ) - s << " KOKKOS_ENABLE_OPENMP" ; -#endif -#if defined( KOKKOS_ENABLE_HWLOC ) - - const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - - s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]" - << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" ) - ; -#endif - - const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ; + const bool is_initialized = Impl::t_openmp_instance != nullptr; if ( is_initialized ) { - const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ; - const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ; - const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ; + Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" ); + + const int numa_count = 1; + const int core_per_numa = Impl::g_openmp_hardware_max_threads; + const int thread_per_core = 1; s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa << " x " << thread_per_core << " ]" << std::endl ; - - if ( detail ) { - std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] ); - -#pragma omp parallel - { -#pragma omp critical - { - coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate(); - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - - for ( unsigned i = 0 ; i < coord.size() ; ++i ) { - s << " thread omp_rank[" << i << "]" - << " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]" - << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]" - << std::endl ; - } - } } else { s << " not initialized" << std::endl ; } } +std::vector<OpenMP> OpenMP::partition(...) +{ return std::vector<OpenMP>(1); } + +OpenMP OpenMP::create_instance(...) 
{ return OpenMP(); } + + +#if !defined( KOKKOS_DISABLE_DEPRECATED ) + int OpenMP::concurrency() { - return thread_pool_size(0); + return Impl::g_openmp_hardware_max_threads; } -const char* OpenMP::name() { return "OpenMP"; } +void OpenMP::initialize( int thread_count , int, int ) +{ + initialize(thread_count); +} + +#endif } // namespace Kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 75b7f5da4a..37d2ac8318 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -47,6 +47,10 @@ #include #if defined( KOKKOS_ENABLE_OPENMP ) +#if !defined(_OPENMP) +#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!" +#endif + #include #include @@ -54,6 +58,8 @@ #include +#include + #include #include #include @@ -63,8 +69,14 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { +namespace Kokkos { namespace Impl { + +class OpenMPExec; + +extern int g_openmp_hardware_max_threads; + +extern __thread int t_openmp_hardware_id; +extern __thread OpenMPExec * t_openmp_instance; //---------------------------------------------------------------------------- /** \brief Data for OpenMP thread execution */ @@ -74,279 +86,279 @@ public: friend class Kokkos::OpenMP ; - enum { MAX_THREAD_COUNT = 4096 }; + enum { MAX_THREAD_COUNT = 512 }; + + void clear_thread_data(); + + static void validate_partition( const int nthreads + , int & num_partitions + , int & partition_size + ); private: + OpenMPExec( int arg_pool_size ) + : m_pool_size{ arg_pool_size } + , m_level{ omp_get_level() } + , m_pool() + {} - static int m_pool_topo[ 4 ]; - static int m_map_rank[ MAX_THREAD_COUNT ]; + ~OpenMPExec() + { + clear_thread_data(); + } - static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ]; + int m_pool_size; + int m_level; - static - void clear_thread_data(); + HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ]; public: - // Topology of a cache coherent thread pool: - // TOTAL = NUMA x GRAIN - // pool_size( depth = 0 ) - // pool_size(0) = total number of threads - // pool_size(1) = number of threads per NUMA - // pool_size(2) = number of threads sharing finest grain memory hierarchy + static void verify_is_master( const char * const ); - inline static - int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; } - - static void finalize(); - - static void initialize( const unsigned team_count , - const unsigned threads_per_team , - const unsigned numa_count , - const unsigned cores_per_numa ); - - static void verify_is_process( const char * const ); - static void verify_initialized( const char * const ); - - - static void resize_thread_data( size_t pool_reduce_bytes , size_t team_reduce_bytes , size_t team_shared_bytes , size_t thread_local_bytes ); - inline static - HostThreadTeamData * get_thread_data() noexcept - { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; } + inline + HostThreadTeamData * get_thread_data() const noexcept + { return m_pool[ m_level == omp_get_level() ? 
0 : omp_get_thread_num() ]; } - inline static - HostThreadTeamData * get_thread_data( int i ) noexcept - { return m_pool[i]; } + inline + HostThreadTeamData * get_thread_data( int i ) const noexcept + { return m_pool[i]; } }; -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class ... Properties > -class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits -{ -public: - - //! Tag this class as a kokkos execution policy - typedef TeamPolicyInternal execution_policy ; - - typedef PolicyTraits traits; - - TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_team_alloc = p.m_team_alloc; - m_team_iter = p.m_team_iter; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - return *this; - } - - //---------------------------------------- - - template< class FunctorType > - inline static - int team_size_max( const FunctorType & ) { - int pool_size = traits::execution_space::thread_pool_size(1); - int max_host_team_size = Impl::HostThreadTeamData::max_team_members; - return pool_size - inline static - int team_size_recommended( const FunctorType & ) - { return traits::execution_space::thread_pool_size(2); } - - template< class FunctorType > - inline static - int team_size_recommended( const FunctorType &, const int& ) - { return traits::execution_space::thread_pool_size(2); } - - //---------------------------------------- - -private: - - int m_league_size ; - int m_team_size ; - int m_team_alloc ; - int m_team_iter ; - - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - - int m_chunk_size; - - inline void init( const int league_size_request - , const int team_size_request ) - { - const int pool_size = traits::execution_space::thread_pool_size(0); - const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; - const int team_max = pool_size 0) { - if(!Impl::is_integral_power_of_two( m_chunk_size )) - Kokkos::abort("TeamPolicy blocking granularity must be power of two" ); - } - - int new_chunk_size = 1; - while(new_chunk_size*100*concurrency < m_league_size) - new_chunk_size *= 2; - if(new_chunk_size < 128) { - new_chunk_size = 1; - while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) ) - new_chunk_size*=2; - } - m_chunk_size = new_chunk_size; - } - -public: - typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ; -}; -} // namespace Impl - -} // namespace Kokkos +}} // namespace Kokkos::Impl //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { -inline -bool OpenMP::in_parallel() -{ return omp_in_parallel(); } +inline OpenMP::OpenMP() noexcept +{} inline -int OpenMP::thread_pool_size( int depth ) +bool OpenMP::is_initialized() noexcept +{ return Impl::t_openmp_instance != nullptr; } + +inline +bool OpenMP::in_parallel( OpenMP const& ) noexcept { - return Impl::OpenMPExec::pool_size(depth); + //t_openmp_instance is only non-null on a master thread + return !Impl::t_openmp_instance + || 
Impl::t_openmp_instance->m_level < omp_get_level() + ; +} + +inline +int OpenMP::thread_pool_size() noexcept +{ + return OpenMP::in_parallel() + ? omp_get_num_threads() + : Impl::t_openmp_instance->m_pool_size + ; } KOKKOS_INLINE_FUNCTION -int OpenMP::thread_pool_rank() +int OpenMP::thread_pool_rank() noexcept { #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ]; + return Impl::t_openmp_instance ? 0 : omp_get_thread_num(); #else return -1 ; #endif } +inline +void OpenMP::fence( OpenMP const& instance ) noexcept {} + +inline +bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept +{ return false; } + +template <typename F> +void OpenMP::partition_master( F const& f + , int num_partitions + , int partition_size + ) +{ + if (omp_get_nested()) { + using Exec = Impl::OpenMPExec; + + Exec * prev_instance = Impl::t_openmp_instance; + + Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size ); + + OpenMP::memory_space space; + + #pragma omp parallel num_threads(num_partitions) + { + void * const ptr = space.allocate( sizeof(Exec) ); + + Impl::t_openmp_instance = new (ptr) Exec( partition_size ); + + size_t pool_reduce_bytes = 32 * partition_size ; + size_t team_reduce_bytes = 32 * partition_size ; + size_t team_shared_bytes = 1024 * partition_size ; + size_t thread_local_bytes = 1024 ; + + Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes + , team_reduce_bytes + , team_shared_bytes + , thread_local_bytes + ); + + f( omp_get_thread_num(), omp_get_num_threads() ); + + Impl::t_openmp_instance->~Exec(); + space.deallocate( Impl::t_openmp_instance, sizeof(Exec) ); + Impl::t_openmp_instance = nullptr; + } + + Impl::t_openmp_instance = prev_instance; + } + else { + // nested openmp not enabled + f(0,1); + } +} + + +namespace Experimental { + +template<> +class MasterLock<OpenMP> +{ +public: + void lock() { omp_set_lock( &m_lock ); } + void unlock() { omp_unset_lock( &m_lock ); } + bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); } + + MasterLock() { omp_init_lock( &m_lock ); } + ~MasterLock() { omp_destroy_lock( &m_lock ); } + + MasterLock( MasterLock const& ) = delete; + MasterLock( MasterLock && ) = delete; + MasterLock & operator=( MasterLock const& ) = delete; + MasterLock & operator=( MasterLock && ) = delete; + +private: + omp_lock_t m_lock; + +}; + +template<> +class UniqueToken< OpenMP, UniqueTokenScope::Instance> +{ +public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::OpenMP::thread_pool_size(); + #else + return 0 ; + #endif + } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::OpenMP::thread_pool_rank(); + #else + return 0 ; + #endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( int ) const noexcept {} +}; + +template<> +class UniqueToken< OpenMP, UniqueTokenScope::Global> +{ +public: + using execution_space = OpenMP; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int size() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::Impl::g_openmp_hardware_max_threads ; + #else + return 0 ; + #endif + } + + /// \brief acquire value such that 0 <= value < size() + KOKKOS_INLINE_FUNCTION + int acquire() const noexcept + { + #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Kokkos::Impl::t_openmp_hardware_id ; + #else + return 0 ; + #endif + } + + /// \brief release a value acquired by generate + KOKKOS_INLINE_FUNCTION + void release( int ) const noexcept {} +}; + +} // namespace Experimental + + +#if !defined( KOKKOS_DISABLE_DEPRECATED ) + +inline +int OpenMP::thread_pool_size( int depth ) +{ + return depth < 2 + ? thread_pool_size() + : 1; +} + +KOKKOS_INLINE_FUNCTION +int OpenMP::hardware_thread_id() noexcept +{ +#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) + return Impl::t_openmp_hardware_id; +#else + return -1 ; +#endif +} + +inline +int OpenMP::max_hardware_threads() noexcept +{ + return Impl::g_openmp_hardware_max_threads; +} + +#endif // KOKKOS_DISABLE_DEPRECATED + } // namespace Kokkos #endif diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index c47e0fc654..b54abb0068 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -52,6 +52,8 @@ #include #include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -71,8 +73,9 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - const FunctorType m_functor ; - const Policy m_policy ; + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const Policy m_policy ; template< class TagType > inline static @@ -110,16 +113,120 @@ private: public: inline void execute() const + { + enum { is_dynamic = std::is_same< typename Policy::schedule_type::type + , Kokkos::Dynamic >::value + }; + + if ( OpenMP::in_parallel() ) { + exec_range< WorkTag >( m_functor + , m_policy.begin() + , m_policy.end() ); + } + else { + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData & data = *(m_instance->get_thread_data()); + + data.set_work_partition( m_policy.end() - m_policy.begin() + , 
m_policy.chunk_size() ); + + if ( is_dynamic ) { + // Make sure work partition is set before stealing + if ( data.pool_rendezvous() ) data.pool_rendezvous_release(); + } + + std::pair<int64_t,int64_t> range(0,0); + + do { + + range = is_dynamic ? data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelFor::template + exec_range< WorkTag >( m_functor + , range.first + m_policy.begin() + , range.second + m_policy.begin() ); + + } while ( is_dynamic && 0 <= range.first ); + } + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , Policy arg_policy ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_policy( arg_policy ) + {} +}; + + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::OpenMP + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend ) { + #ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP + #pragma ivdep + #endif + #endif + for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) { + iterate_type( mdr_policy, functor )( iwork ); + } + } + +public: + + inline void execute() const + { enum { is_dynamic = std::is_same< typename Policy::schedule_type::type , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for"); + if ( OpenMP::in_parallel() ) { + ParallelFor::exec_range ( m_mdr_policy + , m_functor + , m_policy.begin() + , m_policy.end() ); + } + else { -#pragma omp parallel + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); data.set_work_partition( m_policy.end() - m_policy.begin() , m_policy.chunk_size() ); @@ -136,8 +243,8 @@ public: range = is_dynamic ? 
data.get_work_stealing_chunk() : data.get_work_partition(); - ParallelFor::template - exec_range< WorkTag >( m_functor + ParallelFor::exec_range( m_mdr_policy + , m_functor , range.first + m_policy.begin() , range.second + m_policy.begin() ); @@ -145,12 +252,15 @@ public: } // END #pragma omp parallel } + } inline ParallelFor( const FunctorType & arg_functor - , Policy arg_policy ) - : m_functor( arg_functor ) - , m_policy( arg_policy ) + , MDRangePolicy arg_policy ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) {} }; @@ -191,10 +301,11 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const ReducerType m_reducer ; - const pointer_type m_result_ptr ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; template< class TagType > inline static @@ -228,21 +339,21 @@ public: enum { is_dynamic = std::is_same< typename Policy::schedule_type::type , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); const size_t pool_reduce_bytes = Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); - OpenMPExec::resize_thread_data( pool_reduce_bytes + m_instance->resize_thread_data( pool_reduce_bytes , 0 // team_reduce_bytes , 0 // team_shared_bytes , 0 // thread_local_bytes ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); data.set_work_partition( m_policy.end() - m_policy.begin() , m_policy.chunk_size() ); @@ -271,16 +382,15 @@ public: } while ( is_dynamic && 0 <= range.first ); } -// END #pragma omp parallel // Reduction: - const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() ); + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); - for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) { + for ( int i = 1 ; i < pool_size ; ++i ) { ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr - , OpenMPExec::get_thread_data(i)->pool_reduce_local() ); + , m_instance->get_thread_data(i)->pool_reduce_local() ); } Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); @@ -303,7 +413,8 @@ public: Kokkos::is_view< ViewType >::value && !Kokkos::is_reducer_type<ReducerType>::value ,void*>::type = NULL) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( InvalidType() ) , m_result_ptr( arg_view.data() ) @@ -317,7 +428,8 @@ public: ParallelReduce( const FunctorType & arg_functor , Policy arg_policy , const ReducerType& reducer ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( reducer ) , m_result_ptr( reducer.view().data() ) @@ -329,6 +441,173 @@ public: }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... 
Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ...> + , ReducerType + , Kokkos::OpenMP + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ; + + typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + + typedef typename Analysis::pointer_type pointer_type ; + typedef typename Analysis::reference_type reference_type ; + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + OpenMPExec * m_instance ; + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend + , reference_type update ) + { + for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) { + iterate_type( mdr_policy, functor, update )( iwork ); + } + } + +public: + + inline void execute() const + { + enum { is_dynamic = std::is_same< typename Policy::schedule_type::type + , Kokkos::Dynamic >::value }; + + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); + + const size_t pool_reduce_bytes = + Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); + + m_instance->resize_thread_data( pool_reduce_bytes + , 0 // team_reduce_bytes + , 0 // team_shared_bytes + , 0 // thread_local_bytes + ); + + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) + { + HostThreadTeamData & data = *(m_instance->get_thread_data()); + + data.set_work_partition( m_policy.end() - m_policy.begin() + , m_policy.chunk_size() ); + + if ( is_dynamic ) { + // Make sure work partition is set before stealing + if ( data.pool_rendezvous() ) data.pool_rendezvous_release(); + } + + reference_type update = + ValueInit::init( ReducerConditional::select(m_functor , m_reducer) + , data.pool_reduce_local() ); + + std::pair range(0,0); + + do { + + range = is_dynamic ? 
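[Note: throughout these MDRangePolicy specializations the multidimensional range is executed through a plain RangePolicy(0, m_num_tiles).set_chunk_size(1): the iteration space is cut into tiles, the tiles are enumerated by one integer, and HostIterateTile decodes each tile index back into multidimensional offsets. A sketch of that flattening for a 2-D range; the decode order mirrors the Left-layout loops later in this patch, while the helper names are illustrative:

#include <cstdio>

int main() {
  const int upper[2] = {10, 7};  // exclusive upper bounds
  const int tile[2]  = {4, 4};   // tile extents
  const int tend[2]  = {(upper[0] + tile[0] - 1) / tile[0],   // 3 tiles in dim 0
                        (upper[1] + tile[1] - 1) / tile[1]};  // 2 tiles in dim 1
  const int num_tiles = tend[0] * tend[1];   // the RangePolicy runs over [0, num_tiles)
  for (int t = 0; t < num_tiles; ++t) {
    int idx = t, off[2];
    for (int d = 0; d < 2; ++d) {   // "Left" decode: fastest-varying dimension first
      off[d] = (idx % tend[d]) * tile[d];
      idx /= tend[d];
    }
    std::printf("tile %d -> offset (%d,%d)\n", t, off[0], off[1]);
  }
}
]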
data.get_work_stealing_chunk() + : data.get_work_partition(); + + ParallelReduce::exec_range ( m_mdr_policy, m_functor + , range.first + m_policy.begin() + , range.second + m_policy.begin() + , update ); + + } while ( is_dynamic && 0 <= range.first ); + } +// END #pragma omp parallel + + // Reduction: + + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); + + for ( int i = 1 ; i < pool_size ; ++i ) { + ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) + , ptr + , m_instance->get_thread_data(i)->pool_reduce_local() ); + } + + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + + if ( m_result_ptr ) { + const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) ); + + for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; } + } + } + + //---------------------------------------- + + template< class ViewType > + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ViewType & arg_view + , typename std::enable_if< + Kokkos::is_view< ViewType >::value && + !Kokkos::is_reducer_type::value + ,void*>::type = NULL) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_view.data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + +}; + } // namespace Impl } // namespace Kokkos @@ -361,8 +640,9 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; template< class TagType > inline static @@ -394,23 +674,23 @@ public: inline void execute() const { - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan"); const int value_count = Analysis::value_count( m_functor ); const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor ); - OpenMPExec::resize_thread_data( pool_reduce_bytes + m_instance->resize_thread_data( pool_reduce_bytes , 0 // team_reduce_bytes , 0 // team_shared_bytes , 0 // thread_local_bytes ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); - const WorkRange range( m_policy, data.pool_rank(), data.pool_size() ); + const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() ); reference_type 
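[Note: with this specialization in place, parallel_reduce over an MDRangePolicy behaves on the OpenMP backend just like a 1-D RangePolicy reduction. A usage sketch against the public API of this Kokkos generation (MDRangePolicy still lived in Kokkos::Experimental at the time; extents are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using Policy = Kokkos::Experimental::MDRangePolicy<Kokkos::OpenMP, Kokkos::Rank<2>>;
    double sum = 0.0;
    Kokkos::parallel_reduce(
      Policy({0, 0}, {128, 64}),
      KOKKOS_LAMBDA(const int i, const int j, double& update) {
        update += i * 0.5 + j;   // any per-(i,j) contribution
      },
      sum);
    // sum now holds the reduction over the full 128 x 64 index space
  }
  Kokkos::finalize();
}
]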
update_sum = ValueInit::init( m_functor , data.pool_reduce_local() ); @@ -422,7 +702,7 @@ public: pointer_type ptr_prev = 0 ; - const int n = data.pool_size(); + const int n = omp_get_num_threads(); for ( int i = 0 ; i < n ; ++i ) { @@ -452,7 +732,6 @@ public: ParallelScan::template exec_range< WorkTag > ( m_functor , range.begin() , range.end() , update_base , true ); } -/* END #pragma omp parallel */ } @@ -461,7 +740,8 @@ public: inline ParallelScan( const FunctorType & arg_functor , const Policy & arg_policy ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) {} @@ -492,9 +772,10 @@ private: typedef typename Policy::schedule_type::type SchedTag ; typedef typename Policy::member_type Member ; - const FunctorType m_functor ; - const Policy m_policy ; - const int m_shmem_size ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const int m_shmem_size; template< class TagType > inline static @@ -548,22 +829,22 @@ public: { enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for"); const size_t pool_reduce_size = 0 ; // Never shrinks const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size(); const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0 ; // Never shrinks - OpenMPExec::resize_thread_data( pool_reduce_size + m_instance->resize_thread_data( pool_reduce_size , team_reduce_size , team_shared_size , thread_local_size ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); const int active = data.organize_team( m_policy.team_size() ); @@ -598,14 +879,14 @@ public: data.disband_team(); } -// END #pragma omp parallel } inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + @@ -646,11 +927,12 @@ private: typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - const FunctorType m_functor ; - const Policy m_policy ; - const ReducerType m_reducer ; - const pointer_type m_result_ptr ; - const int m_shmem_size ; + OpenMPExec * m_instance; + const FunctorType m_functor; + const Policy m_policy; + const ReducerType m_reducer; + const pointer_type m_result_ptr; + const int m_shmem_size; template< class TagType > inline static @@ -706,8 +988,7 @@ public: { enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value }; - OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce"); const size_t pool_reduce_size = Analysis::value_size( ReducerConditional::select(m_functor, m_reducer)); @@ -716,14 +997,15 @@ public: const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1); const size_t thread_local_size = 0 ; // Never shrinks - OpenMPExec::resize_thread_data( pool_reduce_size + m_instance->resize_thread_data( 
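[Note on the parallel_scan hunk above: pool_reduce_bytes is doubled because each thread keeps two accumulators, and the algorithm is the classic two-pass scan: pass one accumulates a per-thread total over a static range split, one thread then exclusively scans the per-thread totals, and pass two rescans each range starting from the thread's base offset. A self-contained sketch of that shape (an inclusive scan of ones; the splitting arithmetic is simplified relative to WorkRange):

#include <omp.h>
#include <vector>
#include <cstdio>

int main() {
  const int N = 16;
  std::vector<long> x(N, 1), out(N);
  std::vector<long> partial(omp_get_max_threads() + 1, 0);  // per-thread totals
  #pragma omp parallel
  {
    const int r = omp_get_thread_num(), p = omp_get_num_threads();
    const int b = N * r / p, e = N * (r + 1) / p;  // static split by rank
    long s = 0;                                    // pass 1: local total only
    for (int i = b; i < e; ++i) s += x[i];
    partial[r + 1] = s;
    #pragma omp barrier
    #pragma omp single                             // exclusive scan of the totals
    for (int t = 0; t < p; ++t) partial[t + 1] += partial[t];
    long run = partial[r];                         // pass 2: rescan from the base offset
    for (int i = b; i < e; ++i) { out[i] = run + x[i]; run += x[i]; }
  }
  std::printf("out[N-1] = %ld\n", out[N - 1]);
}
]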
pool_reduce_size , team_reduce_size , team_shared_size , thread_local_size ); -#pragma omp parallel + const int pool_size = OpenMP::thread_pool_size(); + #pragma omp parallel num_threads(pool_size) { - HostThreadTeamData & data = *OpenMPExec::get_thread_data(); + HostThreadTeamData & data = *(m_instance->get_thread_data()); const int active = data.organize_team( m_policy.team_size() ); @@ -763,17 +1045,26 @@ public: } data.disband_team(); + + // This thread has updated 'pool_reduce_local()' with its + // contributions to the reduction. The parallel region is + // about to terminate and the master thread will load and + // reduce each 'pool_reduce_local()' contribution. + // Must 'memory_fence()' to guarantee that storing the update to + // 'pool_reduce_local()' will complete before this thread + // exits the parallel region. + + memory_fence(); } -// END #pragma omp parallel // Reduction: - const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() ); + const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() ); - for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) { + for ( int i = 1 ; i < pool_size ; ++i ) { ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr - , OpenMPExec::get_thread_data(i)->pool_reduce_local() ); + , m_instance->get_thread_data(i)->pool_reduce_local() ); } Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); @@ -796,7 +1087,8 @@ public: Kokkos::is_view< ViewType >::value && !Kokkos::is_reducer_type::value ,void*>::type = NULL) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( InvalidType() ) , m_result_ptr( arg_result.ptr_on_device() ) @@ -810,7 +1102,8 @@ public: ParallelReduce( const FunctorType & arg_functor , Policy arg_policy , const ReducerType& reducer ) - : m_functor( arg_functor ) + : m_instance( t_openmp_instance ) + , m_functor( arg_functor ) , m_policy( arg_policy ) , m_reducer( reducer ) , m_result_ptr( reducer.view().data() ) diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp index d4ade211f8..77363876b0 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp @@ -105,7 +105,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute { using execution_space = Kokkos::OpenMP ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; static task_root_type * const end = @@ -115,23 +115,19 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute HostThreadTeamData & team_data_single = HostThreadTeamDataSingleton::singleton(); - const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core - // const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA + Impl::OpenMPExec * instance = t_openmp_instance; + const int pool_size = OpenMP::thread_pool_size(); -#if 0 -fprintf(stdout,"TaskQueue execute %d\n", team_size ); -fflush(stdout); -#endif + const int team_size = 1; // Threads per core + instance->resize_thread_data( 0 /* global reduce buffer */ + , 512 * team_size /* team reduce buffer */ + , 0 /* team shared buffer */ + , 0 /* thread local buffer */ + ); - OpenMPExec::resize_thread_data( 0 
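[Note: the new comment block above spells out why the added memory_fence() matters: the per-thread reduction buffers are read by the master after the parallel region ends, so each thread's store to pool_reduce_local() must become globally visible before the thread leaves. The same release/acquire shape in portable C++, as a sketch rather than the Kokkos fence implementation:

#include <atomic>
#include <thread>
#include <cstdio>

int main() {
  double slot = 0.0;              // stands in for a pool_reduce_local() buffer
  std::atomic<bool> done{false};
  std::thread worker([&] {
    slot = 42.0;                                          // the thread's contribution
    std::atomic_thread_fence(std::memory_order_release);  // like memory_fence(): publish first
    done.store(true, std::memory_order_relaxed);
  });
  while (!done.load(std::memory_order_relaxed)) { }       // master waits
  std::atomic_thread_fence(std::memory_order_acquire);    // acquire before reading the slot
  std::printf("joined value = %f\n", slot);
  worker.join();
}
]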
/* global reduce buffer */ - , 512 * team_size /* team reduce buffer */ - , 0 /* team shared buffer */ - , 0 /* thread local buffer */ - ); - -#pragma omp parallel + #pragma omp parallel num_threads(pool_size) { - Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data(); + Impl::HostThreadTeamData & self = *(instance->get_thread_data()); // Organizing threads into a team performs a barrier across the // entire pool to insure proper initialization of the team @@ -142,18 +138,6 @@ fflush(stdout); Member single_exec( team_data_single ); Member team_exec( self ); -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) team(%d of %d) league(%d of %d) running\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - ); -fflush(stdout); -#endif - // Loop until all queues are empty and no tasks in flight task_root_type * task = 0 ; @@ -197,15 +181,6 @@ fflush(stdout); // if a single thread task then execute now -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) executing single task 0x%lx\n" - , self.pool_rank() - , self.pool_size() - , int64_t(task) - ); -fflush(stdout); -#endif - (*task->m_apply)( task , & single_exec ); leader_loop = true ; @@ -220,57 +195,14 @@ fflush(stdout); if ( 0 != task ) { // Thread Team Task -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - , int64_t(task) - ); -fflush(stdout); -#endif - (*task->m_apply)( task , & team_exec ); // The m_apply function performs a barrier } } while( 0 != task ); - -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) team(%d of %d) league(%d of %d) ending\n" - , self.pool_rank() - , self.pool_size() - , team_exec.team_rank() - , team_exec.team_size() - , team_exec.league_rank() - , team_exec.league_size() - ); -fflush(stdout); -#endif - } - self.disband_team(); - -#if 0 -fprintf(stdout,"TaskQueue pool(%d of %d) disbanded\n" - , self.pool_rank() - , self.pool_size() - ); -fflush(stdout); -#endif - } -// END #pragma omp parallel - -#if 0 -fprintf(stdout,"TaskQueue execute %d end\n", team_size ); -fflush(stdout); -#endif - } void TaskQueueSpecialization< Kokkos::OpenMP >:: @@ -279,10 +211,10 @@ void TaskQueueSpecialization< Kokkos::OpenMP >:: { using execution_space = Kokkos::OpenMP ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; - if ( 1 == omp_get_num_threads() ) { + if ( 1 == OpenMP::thread_pool_size() ) { task_root_type * const end = (task_root_type *) task_root_type::EndTag ; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 82fbef255b..dfa1635e08 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -45,7 +45,7 @@ #define KOKKOS_IMPL_OPENMP_TASK_HPP #include -#if defined( KOKKOS_ENABLE_TASKDAG ) +#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -60,7 +60,7 @@ public: using execution_space = Kokkos::OpenMP ; using queue_type = 
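[Note on the task-queue hunk above (and the #if 0 printf blocks it deletes): the loop is a consume-until-empty protocol. Threads organize into a team; the leader pulls single-thread tasks and runs them immediately through (*task->m_apply), while team tasks are run by every member with m_apply providing the barrier. A heavily reduced sketch of just the consume-until-empty part, with a mutex-guarded queue in place of TaskQueue and no in-flight tasks; every name here is illustrative:

#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

int main() {
  std::queue<std::function<void()>> queue;  // stands in for TaskQueue<OpenMP>
  std::mutex mtx;
  for (int i = 0; i < 8; ++i) queue.push([i] { std::printf("task %d\n", i); });
  auto worker = [&] {
    for (;;) {                              // loop until the queue is drained
      std::function<void()> task;
      { std::lock_guard<std::mutex> lock(mtx);
        if (queue.empty()) return;          // nothing in flight in this sketch
        task = std::move(queue.front()); queue.pop(); }
      task();                               // (*task->m_apply)(task, &exec) in the real code
    }
  };
  std::vector<std::thread> pool;
  for (int t = 0; t < 4; ++t) pool.emplace_back(worker);
  for (auto& th : pool) th.join();
}
]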
Kokkos::Impl::TaskQueue< execution_space > ; - using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ; using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ; // Must specify memory space diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp new file mode 100644 index 0000000000..743e6b6e62 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp @@ -0,0 +1,245 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_TEAM_HPP +#define KOKKOS_OPENMP_TEAM_HPP + +#include +#if defined( KOKKOS_ENABLE_OPENMP ) + +#include + +namespace Kokkos { namespace Impl { + +template< class ... Properties > +class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits +{ +public: + + //! 
Tag this class as a kokkos execution policy + typedef TeamPolicyInternal execution_policy ; + + typedef PolicyTraits traits; + + TeamPolicyInternal& operator = (const TeamPolicyInternal& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_team_alloc = p.m_team_alloc; + m_team_iter = p.m_team_iter; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + return *this; + } + + //---------------------------------------- + + template< class FunctorType > + inline static + int team_size_max( const FunctorType & ) { + int pool_size = traits::execution_space::thread_pool_size(1); + int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + return pool_size + inline static + int team_size_recommended( const FunctorType & ) + { return traits::execution_space::thread_pool_size(2); } + + template< class FunctorType > + inline static + int team_size_recommended( const FunctorType &, const int& ) + { return traits::execution_space::thread_pool_size(2); } + + //---------------------------------------- + +private: + + int m_league_size ; + int m_team_size ; + int m_team_alloc ; + int m_team_iter ; + + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + + int m_chunk_size; + + inline void init( const int league_size_request + , const int team_size_request ) + { + const int pool_size = traits::execution_space::thread_pool_size(0); + const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; + const int team_max = pool_size 0) { + if(!Impl::is_integral_power_of_two( m_chunk_size )) + Kokkos::abort("TeamPolicy blocking granularity must be power of two" ); + } + + int new_chunk_size = 1; + while(new_chunk_size*100*concurrency < m_league_size) + new_chunk_size *= 2; + if(new_chunk_size < 128) { + new_chunk_size = 1; + while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) ) + new_chunk_size*=2; + } + m_chunk_size = new_chunk_size; + } + +public: + typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ; +}; + +}} // namespace Kokkos::Impl + +#endif +#endif /* KOKKOS_OPENMP_TEAM_HPP */ + + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..289ad15451 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -0,0 +1,107 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
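[Note: the chunk-size heuristic visible above aims for roughly 100 league iterations per concurrent thread and, if that would still leave a chunk smaller than 128, relaxes to roughly 40 per thread capped at 128. Extracted into a hypothetical free function for illustration of the member logic shown in the hunk:

#include <cstdio>

// Same doubling search as TeamPolicyInternal's default chunk size:
// target ~100 league iterations per concurrent thread; if the chunk is
// still < 128, settle for ~40 per thread, never exceeding 128.
int default_chunk_size(int concurrency, int league_size) {
  int chunk = 1;
  while (chunk * 100 * concurrency < league_size) chunk *= 2;
  if (chunk < 128) {
    chunk = 1;
    while (chunk * 40 * concurrency < league_size && chunk < 128) chunk *= 2;
  }
  return chunk;
}

int main() {
  // e.g. 16 threads and ~1M teams gives a chunk of 1024
  std::printf("%d\n", default_chunk_size(16, 1 << 20));
}
]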
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP +#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::OpenMP + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::OpenMP, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec Base ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + +public: + + inline + void execute() + { + const int pool_size = OpenMP::thread_pool_size(); + + #pragma omp parallel num_threads(pool_size) + { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp index bec7844ed6..258a9d2ff7 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp @@ -1,13 +1,13 @@ /* //@HEADER // ************************************************************************ -// +// // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation -// +// // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. 
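[Note on the OpenMP WorkGraphPolicy driver above: execute() is the entire scheduler. Each thread repeatedly asks Base::before_work() for a vertex whose dependencies are satisfied, runs it, and Base::after_work() releases its dependents; -1 signals the graph is exhausted. A sketch of the dependence-counting idea behind that protocol, run serially here with atomic in-degree counters; the tiny DAG and all helper names are illustrative:

#include <atomic>
#include <cstdio>
#include <vector>

int main() {
  // Tiny DAG: 0 -> 1, 0 -> 2, {1,2} -> 3  (edges point to dependents)
  std::vector<std::vector<int>> dependents = {{1, 2}, {3}, {3}, {}};
  std::vector<std::atomic<int>> indegree(4);
  indegree[0] = 0; indegree[1] = 1; indegree[2] = 1; indegree[3] = 2;

  std::vector<int> ready = {0};                    // vertices with no unmet dependencies
  while (!ready.empty()) {                         // serial stand-in for the thread pool loop
    const int v = ready.back(); ready.pop_back();  // "before_work": claim a ready vertex
    std::printf("run vertex %d\n", v);             // the functor call
    for (int w : dependents[v])                    // "after_work": release dependents
      if (indegree[w].fetch_sub(1) == 1) ready.push_back(w);
  }
}
]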
Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -45,7 +45,7 @@ #define KOKKOS_OPENMPTARGETEXEC_HPP #include -#include +#include #include #include @@ -59,10 +59,10 @@ namespace Impl { class OpenMPTargetExec { -public: +public: enum { MAX_ACTIVE_THREADS = 256*8*56*4 }; enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 }; - + private: static void* scratch_ptr; @@ -70,7 +70,7 @@ public: static void verify_is_process( const char * const ); static void verify_initialized( const char * const ); - static void* get_scratch_ptr(); + static void* get_scratch_ptr(); static void clear_scratch(); static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes ); @@ -159,7 +159,7 @@ public: KOKKOS_INLINE_FUNCTION void team_barrier() const { - #pragma omp barrier + #pragma omp barrier } template @@ -191,13 +191,13 @@ public: typedef ValueType value_type; const JoinLambdaAdapter op(op_in); - + // Make sure there is enough scratch space: typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE , value_type , void >::type type ; const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type); - type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); + type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num()); for(int i = m_team_rank; i < n_values; i+= m_team_size) { team_scratch[i] = value_type(); } @@ -209,7 +209,7 @@ public: team_scratch[m_team_rank%n_values]+=value; #pragma omp barrier } - + for(int d = 1; d #if defined( KOKKOS_ENABLE_QTHREADS ) -#include +#include //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp index 4c805310cc..35b2163ae5 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -45,14 +45,14 @@ #include #if defined( KOKKOS_ENABLE_THREADS ) -#include - #include #include #include #include #include + #include + #include #include #include @@ -80,9 +80,7 @@ const void * volatile s_current_function_arg = 0 ; struct Sentinel { Sentinel() - { - HostSpace::register_in_parallel( ThreadsExec::in_parallel ); - } + {} ~Sentinel() { @@ -122,6 +120,8 @@ void execute_function_noop( ThreadsExec & , const void * ) {} void ThreadsExec::driver(void) { + SharedAllocationRecord< void, void >::tracking_enable(); + ThreadsExec this_thread ; while ( ThreadsExec::Active == this_thread.m_pool_state ) { @@ -726,6 +726,8 @@ void ThreadsExec::initialize( unsigned thread_count , // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp index 74de3a2596..7557bad7d9 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -50,11 +50,12 @@ #include #include -#include +#include #include #include +#include //---------------------------------------------------------------------------- namespace Kokkos { @@ -275,6 +276,17 @@ public: if ( ! 
rev_rank ) { Final::final( f , reduce_memory() ); } + + // This thread has updated 'reduce_memory()' and upon returning + // from this function will set 'm_pool_state' to inactive. + // If this is a non-root thread then setting 'm_pool_state' + // to inactive triggers another thread to exit a spinwait + // and read the 'reduce_memory'. + // Must 'memory_fence()' to guarantee that storing the update to + // 'reduce_memory()' will complete before storing the the update to + // 'm_pool_state'. + + memory_fence(); } inline @@ -627,6 +639,62 @@ inline void Threads::fence() } /* namespace Kokkos */ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Experimental { + +template<> +class UniqueToken< Threads, UniqueTokenScope::Instance> +{ +public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 0 <= value < size() + inline + int size() const noexcept { return Threads::thread_pool_size(); } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return Threads::thread_pool_rank(); } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +template<> +class UniqueToken< Threads, UniqueTokenScope::Global> +{ +public: + using execution_space = Threads; + using size_type = int; + + /// \brief create object size for concurrency on the given instance + /// + /// This object should not be shared between instances + UniqueToken( execution_space const& = execution_space() ) noexcept {} + + /// \brief upper bound for acquired values, i.e. 
0 <= value < size() + inline + int size() const noexcept { return Threads::thread_pool_size(); } + + /// \brief acquire value such that 0 <= value < size() + inline + int acquire() const noexcept { return Threads::thread_pool_rank(); } + + /// \brief release a value acquired by generate + inline + void release( int ) const noexcept {} +}; + +}} // namespace Kokkos::Experimental //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #endif diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp index c12019413b..6060bf191f 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -50,7 +50,7 @@ #include #include -#include +#include #include #include @@ -482,6 +482,8 @@ public: void next_static() { if ( m_league_rank < m_league_end ) { + // Make sure all stores are complete before entering the barrier + memory_fence(); team_barrier(); set_team_shared(); } @@ -518,6 +520,8 @@ public: return; if ( m_league_rank < m_league_chunk_end ) { + // Make sure all stores are complete before entering the barrier + memory_fence(); team_barrier(); set_team_shared(); } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp index 0ee0cd3280..18ac7d26ad 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -55,6 +55,8 @@ #include #include +#include + //---------------------------------------------------------------------------- namespace Kokkos { @@ -174,6 +176,108 @@ public: {} }; + +// MDRangePolicy impl +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::Threads + > +{ +private: + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
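[Note: for the Threads backend both UniqueToken specializations can simply hand out the pool rank, since the pool size is fixed and every running thread already owns a unique rank, so acquire()/release() cost nothing. Typical use is indexing per-thread scratch from inside a kernel, along these lines (a usage sketch against the Kokkos::Experimental API added here):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Experimental::UniqueToken<Kokkos::Threads> token;
    Kokkos::View<double*> scratch("scratch", token.size());  // one slot per concurrent thread
    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Threads>(0, 1000),
      KOKKOS_LAMBDA(const int i) {
        const int id = token.acquire();  // on Threads this is just the pool rank
        scratch(id) += i;                // race-free: no two running threads share an id
        token.release(id);
      });
  }
  Kokkos::finalize();
}
]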
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member ibeg , const Member iend ) + { + #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \ + defined( KOKKOS_ENABLE_PRAGMA_IVDEP ) + #pragma ivdep + #endif + for ( Member i = ibeg ; i < iend ; ++i ) { + iterate_type( mdr_policy, functor )( i ); + } + } + + static void exec( ThreadsExec & exec , const void * arg ) + { + exec_schedule(exec,arg); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + ParallelFor::exec_range + ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() ); + + exec.fan_in(); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() ); + + exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + + while(work_index != -1) { + const Member begin = static_cast(work_index) * self.m_policy.chunk_size(); + const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); + + ParallelFor::exec_range + ( self.m_mdr_policy, self.m_functor , begin , end ); + work_index = exec.get_work_index(); + } + + exec.fan_in(); + } + +public: + + inline + void execute() const + { + ThreadsExec::start( & ParallelFor::exec , this ); + ThreadsExec::fence(); + } + + ParallelFor( const FunctorType & arg_functor + , const MDRangePolicy & arg_policy ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + {} +}; + //---------------------------------------------------------------------------- /* ParallelFor Kokkos::Threads with TeamPolicy */ @@ -440,6 +544,169 @@ public: }; + +// MDRangePolicy impl +template< class FunctorType , class ReducerType, class ... Traits > +class ParallelReduce< FunctorType + , Kokkos::Experimental::MDRangePolicy< Traits ... > + , ReducerType + , Kokkos::Threads + > +{ +private: + + typedef Kokkos::Experimental::MDRangePolicy< Traits ... 
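[Note on the Threads MDRange ParallelFor above: the Static path splits the flattened tile range evenly by pool rank through WorkRange, while the Dynamic path has every thread pull chunk indices via get_work_index() until it returns -1. The static split is block partitioning of [0, n) over pool_size ranks; a sketch of one common rounding (WorkRange's exact remainder handling may differ):

#include <cstdio>

// Block-partition [0, n) across `size` ranks: each rank gets a contiguous
// slice, with the remainder spread over the low ranks.
void work_range(int n, int rank, int size, int* begin, int* end) {
  const int base = n / size, rem = n % size;
  *begin = rank * base + (rank < rem ? rank : rem);
  *end   = *begin + base + (rank < rem ? 1 : 0);
}

int main() {
  int b, e;
  for (int r = 0; r < 4; ++r) {   // 10 tiles over 4 threads -> 3,3,2,2
    work_range(10, r, 4, &b, &e);
    std::printf("rank %d: [%d,%d)\n", r, b, e);
  }
}
]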
> MDRangePolicy ; + typedef typename MDRangePolicy::impl_range_policy Policy ; + + typedef typename MDRangePolicy::work_tag WorkTag ; + typedef typename Policy::WorkRange WorkRange ; + typedef typename Policy::member_type Member ; + + typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + + typedef typename ReducerTypeFwd::value_type ValueType; + + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + + typedef typename ValueTraits::pointer_type pointer_type ; + typedef typename ValueTraits::reference_type reference_type ; + + using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + , FunctorType + , WorkTag + , ValueType + >; + + const FunctorType m_functor ; + const MDRangePolicy m_mdr_policy ; + const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor + const ReducerType m_reducer ; + const pointer_type m_result_ptr ; + + inline static + void + exec_range( const MDRangePolicy & mdr_policy + , const FunctorType & functor + , const Member & ibeg , const Member & iend + , reference_type update ) + { + #if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \ + defined( KOKKOS_ENABLE_PRAGMA_IVDEP ) + #pragma ivdep + #endif + for ( Member i = ibeg ; i < iend ; ++i ) { + iterate_type( mdr_policy, functor, update )( i ); + } + } + + static void + exec( ThreadsExec & exec , const void * arg ) { + exec_schedule(exec, arg); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() ); + + ParallelReduce::exec_range + ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() + , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); + + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + } + + template + static + typename std::enable_if< std::is_same::value >::type + exec_schedule( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() ); + + exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size()); + exec.reset_steal_target(); + exec.barrier(); + + long work_index = exec.get_work_index(); + reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ); + while(work_index != -1) { + const Member begin = static_cast(work_index) * self.m_policy.chunk_size(); + const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); + ParallelReduce::exec_range + ( self.m_mdr_policy, self.m_functor , begin , end + , update ); + work_index = exec.get_work_index(); + } + + exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + } + +public: + + inline + void execute() const + { + ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 ); + + ThreadsExec::start( & ParallelReduce::exec , this ); + 
+ ThreadsExec::fence(); + + if ( m_result_ptr ) { + + const pointer_type data = + (pointer_type) ThreadsExec::root_reduce_scratch(); + + const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); + for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } + } + } + + template< class HostViewType > + ParallelReduce( const FunctorType & arg_functor , + const MDRangePolicy & arg_policy , + const HostViewType & arg_result_view , + typename std::enable_if< + Kokkos::is_view< HostViewType >::value && + !Kokkos::is_reducer_type::value + ,void*>::type = NULL) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( InvalidType() ) + , m_result_ptr( arg_result_view.ptr_on_device() ) + { + static_assert( Kokkos::is_view< HostViewType >::value + , "Kokkos::Threads reduce result must be a View" ); + + static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value + , "Kokkos::Threads reduce result must be a View in HostSpace" ); + } + + inline + ParallelReduce( const FunctorType & arg_functor + , MDRangePolicy arg_policy + , const ReducerType& reducer ) + : m_functor( arg_functor ) + , m_mdr_policy( arg_policy ) + , m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) + , m_reducer( reducer ) + , m_result_ptr( reducer.view().data() ) + { + /*static_assert( std::is_same< typename ViewType::memory_space + , Kokkos::HostSpace >::value + , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ + } + +}; + + //---------------------------------------------------------------------------- /* ParallelReduce with Kokkos::Threads and TeamPolicy */ diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..be904a1670 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -0,0 +1,115 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP +#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ... Traits > +class ParallelFor< FunctorType , + Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::Threads + > + : public Kokkos::Impl::Experimental:: + WorkGraphExec< FunctorType, + Kokkos::Threads, + Traits ... + > +{ +private: + + typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; + typedef Kokkos::Impl::Experimental:: + WorkGraphExec Base ; + typedef ParallelFor, + Kokkos::Threads> Self ; + + template< class TagType > + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + Base::m_functor( i ); + } + + template< class TagType > + typename std::enable_if< ! std::is_same< TagType , void >::value >::type + exec_one(const typename Policy::member_type& i) const { + const TagType t{} ; + Base::m_functor( t , i ); + } + + inline void exec_one_thread() const { + for (std::int32_t i; (-1 != (i = Base::before_work())); ) { + exec_one< typename Policy::work_tag >( i ); + Base::after_work(i); + } + } + + static inline void thread_main( ThreadsExec&, const void* arg ) { + const Self& self = *(static_cast(arg)); + self.exec_one_thread(); + } + +public: + + inline + void execute() + { + ThreadsExec::start( & Self::thread_main, this ); + ThreadsExec::fence(); + } + + inline + ParallelFor( const FunctorType & arg_functor + , const Policy & arg_policy ) + : Base( arg_functor, arg_policy ) + { + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */ diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index 77a1e8754d..0171b209e5 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { #define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7] - // New Loop Macros... // parallel_for, non-tagged #define APPLY( func, ... 
) \ @@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl { // end tagged macros - - // Structs for calling loops template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void > struct Tile_Loop_Type; @@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i template using is_void = std::is_same< T , void >; +template +struct is_type_array : std::false_type +{ + using value_type = T; +}; + +template +struct is_type_array< T[] > : std::true_type +{ + using value_type = T; +}; + + template < typename RP , typename Functor , typename Tag = void @@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i RP const& m_rp; Functor const& m_func; typename std::conditional< std::is_same::value,int,Tag>::type m_tag; -// value_type & m_v; - }; -// ValueType: For reductions +// For ParallelReduce +// ValueType - scalar: For reductions template < typename RP , typename Functor , typename Tag , typename ValueType > -struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value >::type > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value && !is_type_array::value >::type > { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2251,12 +2260,497 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i }; +// For ParallelReduce +// Extra specialization for array reductions +// ValueType[]: For array reductions +template < typename RP + , typename Functor + , typename Tag + , typename ValueType + > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value && is_type_array::value >::type > +{ + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; + + using value_type = typename is_type_array::value_type; // strip away the 'array-ness' [], only underlying type remains + + inline + HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treat as pointer for compatibility since size is not known nor needed here + : m_rp(rp) //Cuda 7.0 does not like braces... + , m_func(func) + , m_v(v) // use with non-void ValueType struct + {} + + inline + bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const { + bool is_full_tile = true; + + for ( int i = 0; i < RP::rank; ++i ) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i] ; + } + else { + is_full_tile = false ; + partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? 
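[Note: the new is_type_array trait above is what routes ValueType[] reductions to the array specialization of HostIterateTile: the unsized-array form T[] matches the true_type partial specialization, and value_type strips the array-ness to recover the element type. A self-contained demonstration of the same dispatch trick:

#include <cstdio>
#include <type_traits>

template <typename T> struct is_type_array : std::false_type { using value_type = T; };
template <typename T> struct is_type_array<T[]> : std::true_type { using value_type = T; };

// Two overloads selected the same way the HostIterateTile specializations are:
template <typename V>
typename std::enable_if<!is_type_array<V>::value>::type kind() { std::printf("scalar reduction\n"); }
template <typename V>
typename std::enable_if<is_type_array<V>::value>::type kind() { std::printf("array reduction\n"); }

int main() {
  static_assert(!is_type_array<double>::value, "scalar stays false_type");
  static_assert(is_type_array<double[]>::value, "T[] matches true_type");
  static_assert(std::is_same<is_type_array<double[]>::value_type, double>::value, "element type");
  kind<double>();    // prints: scalar reduction
  kind<double[]>();  // prints: array reduction
}
]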
(m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range + } + } + + return is_full_tile ; + } // end check bounds + + + template + struct RankTag + { + typedef RankTag type; + enum { value = (int)Rank }; + }; + + +#if KOKKOS_ENABLE_NEW_LOOP_MACROS + template + inline + void + operator()(IType tile_idx) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims ); + + } + +#else + template + inline + void + operator()(IType tile_idx) const + { operator_impl( tile_idx , RankTag() ); } + // added due to compiler error when using sfinae to choose operator based on rank + + + template + inline + void operator_impl( IType tile_idx , const RankTag<2> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_2L(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } else { +// #pragma simd + LOOP_2L(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_2R(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } else { +// #pragma simd + LOOP_2R(index_type, m_tiledims) { + apply( LOOP_ARGS_2 ); + } + } + } // end RP::Right + + } //end op() rank == 2 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<3> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_3L(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } else { +// #pragma simd + LOOP_3L(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_3R(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } else { +// #pragma simd + LOOP_3R(index_type, m_tiledims) { + apply( LOOP_ARGS_3 ); + } + } + } // end RP::Right + + } //end op() rank == 3 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<4> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in 
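[Note: check_iteration_bounds(), completed just above, clips boundary tiles: a tile whose offset plus extent would overrun m_upper iterates only the remaining width, so no functor call ever sees an out-of-bounds index. Worked through for one dimension with upper = 10 and tile = 4, tiles start at offsets 0, 4, 8; the first two are full (width 4) and the last is partial (width 10 - 8 = 2):

#include <cstdio>

int main() {
  const int upper = 10, tile = 4;
  for (int off = 0; off < upper; off += tile) {
    const int width = (off + tile <= upper) ? tile : upper - off;  // clip the last tile
    std::printf("tile at %d iterates [%d,%d)\n", off, off, off + width);
  }
}
]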
bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_4L(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } else { +// #pragma simd + LOOP_4L(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_4R(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } else { +// #pragma simd + LOOP_4R(index_type, m_tiledims) { + apply( LOOP_ARGS_4 ); + } + } + } // end RP::Right + + } //end op() rank == 4 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<5> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_5L(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } else { +// #pragma simd + LOOP_5L(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_5R(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } else { +// #pragma simd + LOOP_5R(index_type, m_tiledims) { + apply( LOOP_ARGS_5 ); + } + } + } // end RP::Right + + } //end op() rank == 5 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<6> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_6L(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } else { +// #pragma simd + LOOP_6L(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_6R(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } else { +// #pragma simd + LOOP_6R(index_type, m_tiledims) { + apply( LOOP_ARGS_6 ); + } + } + } // end RP::Right + + } //end op() rank == 6 + + + template + inline + void operator_impl( IType tile_idx , const RankTag<7> ) const + { + point_type m_offset; + point_type m_tiledims; + + if (RP::outer_direction == RP::Left) { + for (int i=0; i=0; --i) { + m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; + tile_idx /= m_rp.m_tile_end[i]; + } + } + + //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims + const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; + + if (RP::inner_direction == RP::Left) { + if ( full_tile ) { +// #pragma simd + LOOP_7L(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } else { +// #pragma simd + LOOP_7L(index_type, m_tiledims) { + apply( LOOP_ARGS_7 ); + } + } + } // end RP::Left + else { + if ( full_tile ) { +// #pragma simd + LOOP_7R(index_type, m_tiledims) { + apply( 
+
+
+    template <typename IType>
+    inline
+    void operator_impl( IType tile_idx , const RankTag<8> ) const
+    {
+      point_type m_offset;
+      point_type m_tiledims;
+
+      if (RP::outer_direction == RP::Left) {
+        for (int i=0; i<RP::rank; ++i) {
+          m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+          tile_idx /= m_rp.m_tile_end[i];
+        }
+      }
+      else {
+        for (int i=RP::rank-1; i>=0; --i) {
+          m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+          tile_idx /= m_rp.m_tile_end[i];
+        }
+      }
+
+      //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+      const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+      if (RP::inner_direction == RP::Left) {
+        if ( full_tile ) {
+//          #pragma simd
+          LOOP_8L(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        } else {
+//          #pragma simd
+          LOOP_8L(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        }
+      } // end RP::Left
+      else {
+        if ( full_tile ) {
+//          #pragma simd
+          LOOP_8R(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        } else {
+//          #pragma simd
+          LOOP_8R(index_type, m_tiledims) {
+            apply( LOOP_ARGS_8 );
+          }
+        }
+      } // end RP::Right
+
+    } //end op() rank == 8
+#endif
+
+
+  template <typename... Args>
+  typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+  apply(Args &&... args) const
+  {
+    m_func(args... , m_v);
+  }
+
+  template <typename... Args>
+  typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+  apply(Args &&... args) const
+  {
+    m_func( m_tag, args... , m_v);
+  }
+
+
+  RP const& m_rp;
+  Functor const& m_func;
+  value_type * m_v;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+
+};
+
 
 // ------------------------------------------------------------------ //
 
 // MDFunctor - wraps the range_policy and functor to pass to IterateTile
-// Serial, Threads, OpenMP
+// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
 // Cuda uses DeviceIterateTile directly within md_parallel_for
-// ParallelReduce
+// TODO Once md_parallel_{for,reduce} removed, this can be removed
+
+// ParallelReduce - scalar reductions
 template < typename MDRange, typename Functor, typename ValueType = void >
 struct MDFunctor
@@ -2273,7 +2767,7 @@ struct MDFunctor
 
   inline
-  MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
+  MDFunctor( MDRange const& range, Functor const& f )
    : m_range( range )
    , m_func( f )
  {}
@@ -2290,7 +2784,6 @@ struct MDFunctor
   inline
   MDFunctor& operator=( MDFunctor && ) = default;
 
-//  KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning
   inline
   void operator()(index_type t, value_type & v) const
   {
@@ -2301,6 +2794,56 @@ struct MDFunctor
   Functor m_func;
 };
 
+
+// ParallelReduce - array reductions
+template < typename MDRange, typename Functor, typename ValueType >
+struct MDFunctor< MDRange, Functor, ValueType[] >
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using value_type   = ValueType[];
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
+                                                                           , Functor
+                                                                           , work_tag
+                                                                           , value_type
+                                                                           >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+    , value_count( f.value_count )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
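The ValueType[] specialization here exists because Kokkos array reductions carry a runtime length: the functor declares value_type as an array type and exposes a value_count member, which the constructor above forwards. A sketch of such a functor under the standard Kokkos reduction conventions of this era (the names ColumnSums and m, and the dimension_1() accessor, are illustrative assumptions, not from this diff):

#include <Kokkos_Core.hpp>

struct ColumnSums {
  typedef double value_type[];     // array value type => array reduction
  Kokkos::View<const double**> m;
  const size_t value_count;        // runtime length read by the dispatch layer

  ColumnSums( const Kokkos::View<const double**> & m_ )
    : m( m_ ), value_count( m_.dimension_1() ) {}

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i, value_type sum ) const {
    for ( size_t j = 0; j < value_count; ++j ) sum[j] += m(i,j);
  }

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type dst, const volatile value_type src ) const {
    for ( size_t j = 0; j < value_count; ++j ) dst[j] += src[j];
  }

  KOKKOS_INLINE_FUNCTION
  void init( value_type sum ) const {
    for ( size_t j = 0; j < value_count; ++j ) sum[j] = 0.0;
  }
};

// Typical use: double sums[NCOL]; Kokkos::parallel_reduce(nrows, ColumnSums(m), sums);

The FIXME just below notes that init and join do not yet reach the functor through MDFunctor; the sketch shows what they would look like once they do.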
+
+  // FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
+  // Best path forward is to eliminate need for MDFunctor, directly use MDRangePolicy within Parallel{For,Reduce} ??
+  inline
+  void operator()(index_type t, value_type v) const
+  {
+    iterate_type(m_range, m_func, v)(t);
+  }
+
+  MDRange m_range;
+  Functor m_func;
+  size_t value_count;
+};
+
+
 // ParallelFor
 template < typename MDRange, typename Functor >
 struct MDFunctor< MDRange, Functor, void >
@@ -2349,4 +2892,3 @@ struct MDFunctor< MDRange, Functor, void >
 
 } } } //end namespace Kokkos::Experimental::Impl
 
 #endif
-
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
index c5685c5b62..3fb15c8d1e 100644
--- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -55,16 +55,19 @@ template < typename ExecutionSpace   = void
          , typename Schedule         = void
          , typename WorkTag          = void
          , typename IndexType        = void
          , typename IterationPattern = void
+         , typename LaunchBounds     = void
          >
 struct PolicyTraitsBase
 {
-  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
+  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType,
+                                 IterationPattern, LaunchBounds>;
 
   using execution_space   = ExecutionSpace;
   using schedule_type     = Schedule;
   using work_tag          = WorkTag;
   using index_type        = IndexType;
   using iteration_pattern = IterationPattern;
+  using launch_bounds     = LaunchBounds;
 };
 
@@ -78,6 +81,7 @@ struct SetExecutionSpace
             , typename PolicyBase::work_tag
             , typename PolicyBase::index_type
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -91,6 +95,7 @@ struct SetSchedule
             , typename PolicyBase::work_tag
             , typename PolicyBase::index_type
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -104,6 +109,7 @@ struct SetWorkTag
             , WorkTag
             , typename PolicyBase::index_type
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -117,6 +123,7 @@ struct SetIndexType
             , typename PolicyBase::work_tag
             , IndexType
             , typename PolicyBase::iteration_pattern
+            , typename PolicyBase::launch_bounds
             >;
 };
 
@@ -131,6 +138,22 @@ struct SetIterationPattern
             , typename PolicyBase::work_tag
             , typename PolicyBase::index_type
             , IterationPattern
+            , typename PolicyBase::launch_bounds
+            >;
+};
+
+
+template <typename PolicyBase, typename LaunchBounds>
+struct SetLaunchBounds
+{
+  static_assert( is_void< typename PolicyBase::launch_bounds >::value
+               , "Kokkos Error: More than one launch_bounds given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               , LaunchBounds
             >;
 };
 
@@ -146,8 +169,9 @@ struct AnalyzePolicy : public
   , typename std::conditional< is_index_type<T>::value       , SetIndexType<Base,T>
   , typename std::conditional< std::is_integral<T>::value    , SetIndexType<Base, IndexType<T> >
   , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
+  , typename std::conditional< is_launch_bounds<T>::value    , SetLaunchBounds<Base,T>
   , SetWorkTag<Base,T>
-  >::type >::type >::type >::type>::type::type
+  >::type >::type >::type >::type >::type>::type::type
   , Traits...
   >
 {};
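With launch_bounds threaded through PolicyTraitsBase, an execution policy can carry a CUDA __launch_bounds__ hint alongside its schedule, tag, and index type. A sketch of the intended use, assuming the LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM> trait is accepted as a RangePolicy template argument (the kernel and names here are illustrative, not from this diff):

#include <Kokkos_Core.hpp>

void scale_by_two( Kokkos::View<double*> x )
{
  // Cap the generated CUDA kernel at 512 threads/block and ask for at
  // least 4 resident blocks per SM; on non-CUDA backends the trait is inert.
  typedef Kokkos::RangePolicy< Kokkos::Cuda, Kokkos::LaunchBounds<512,4> > policy;

  Kokkos::parallel_for( policy( 0, x.dimension_0() ),
    KOKKOS_LAMBDA( const int i ) { x(i) *= 2.0; } );
}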
@@ -178,11 +202,18 @@ struct AnalyzePolicy
     , void // TODO set default iteration pattern
     , typename Base::iteration_pattern
     >::type;
+
+  using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
+                                                 , LaunchBounds<>
+                                                 , typename Base::launch_bounds
+                                                 >::type;
+
   using type = PolicyTraitsBase< execution_space
                                , schedule_type
                                , work_tag
                                , index_type
                                , iteration_pattern
+                               , launch_bounds
                                >;
 };
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index 010b15064e..5b894b037b 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -41,6 +41,10 @@
 //@HEADER
 */
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
 #include <Kokkos_Macros.hpp>
 #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
 #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
@@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
 
 inline
 int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
-{ return __sync_val_compare_and_swap(dest,compare,val); }
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_val_compare_and_swap(dest,compare,val);
+}
 
 inline
 long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
-{ return __sync_val_compare_and_swap(dest,compare,val); }
+{
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+  return __sync_val_compare_and_swap(dest,compare,val);
+}
 
 #if defined( KOKKOS_ENABLE_GNU_ATOMICS )
 
@@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
     KOKKOS_INLINE_FUNCTION U() {};
   } tmp ;
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
   return tmp.t ;
 }
@@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
     KOKKOS_INLINE_FUNCTION U() {};
   } tmp ;
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
   return tmp.t ;
 }
@@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
     KOKKOS_INLINE_FUNCTION U() {};
   } tmp ;
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
   return tmp.t ;
 }
@@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
 #endif
     , const T >::type& val )
 {
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
+#endif
+
   while( !Impl::lock_address_host_space( (void*) dest ) );
   T return_val = *dest;
   if( return_val == compare ) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
index 127de528f5..2a13a4865c 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
@@ -41,6 +41,10 @@
 //@HEADER
 */
 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
+#include <xmmintrin.h>
+#endif
+
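Each of the atomic headers touched here gains the same guarded read-for-ownership prefetch: _MM_HINT_ET0 asks the core to pull the destination line into exclusive state, so the locked read-modify-write that follows does not first fetch a shared copy and then upgrade it. A condensed standalone sketch of the pattern, using the GCC/Clang __sync builtin (x86-only, and only when the build opts in; the function name is illustrative):

#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>   // _mm_prefetch, _MM_HINT_ET0
#endif

inline int fetch_add_with_rfo( volatile int * const dest , const int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );  // prefetch with write intent
#endif
  return __sync_fetch_and_add( dest, val );          // locked read-modify-write
}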
#include #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP ) #define KOKKOS_ATOMIC_DECREMENT_HPP @@ -54,6 +58,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile char* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decb %0" : /* no output registers */ @@ -69,6 +77,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile short* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decw %0" : /* no output registers */ @@ -84,6 +96,10 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif + __asm__ __volatile__( "lock decl %0" : /* no output registers */ @@ -99,6 +115,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_decrement(volatile long long int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock decq %0" : /* no output registers */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp index a1ff47abce..9ba3cae9fc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) #define KOKKOS_ATOMIC_EXCHANGE_HPP @@ -81,6 +85,10 @@ T atomic_exchange( typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) { // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) ); +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + int tmp = atomicExch( ((int*)dest) , *((int*)&val) ); return *((T*)&tmp); } @@ -93,6 +101,11 @@ T atomic_exchange( sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) { typedef unsigned long long int type ; + +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) ); type tmp = atomicExch( ((type*)dest) , *((type*)&val) ); return *((T*)&tmp); @@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest , { T return_val; // This is a way to (hopefully) avoid dead lock in a warp +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + int done = 0; unsigned int active = __ballot(1); unsigned int done_active = 0; @@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest , , const T & >::type val ) { typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif const type v = *((type*)&val); // Extract to be sure the value doesn't change @@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union U { Impl::cas128_t i ; T t ; @@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest , { typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + const type v = *((type*)&val); // Extract to be sure the value doesn't change type assumed ; @@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t) , const T & >::type val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union U { Impl::cas128_t i ; T t ; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp index 860c8e0e43..084c55efed 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_ADD_HPP ) #define KOKKOS_ATOMIC_FETCH_ADD_HPP @@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest , inline int atomic_fetch_add( volatile int * dest , const int val ) { - int original = val; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif - __asm__ __volatile__( - "lock xadd %1, %0" - : "+m" (*dest), "+r" (original) - : "m" (*dest), "r" (original) - : "memory" + int original = val; + + __asm__ __volatile__( + "lock xadd %1, %0" + : "+m" (*dest), "+r" (original) + : "m" (*dest), "r" (original) + : "memory" ); - return original; + return original; } #else inline int atomic_fetch_add( volatile int * const dest , const int val ) -{ return __sync_fetch_and_add(dest, val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest, val); +} #endif inline long int atomic_fetch_add( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} inline unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_add(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_add(dest,val); +} #endif @@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest , inline U() {}; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp index 83f5b2a5aa..6ecb65336c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_AND_HPP ) #define KOKKOS_ATOMIC_FETCH_AND_HPP @@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const inline int atomic_fetch_and( volatile int * const dest , const int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} inline long int atomic_fetch_and( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} inline unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_and(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_and(dest,val); +} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp index 8c73b4c3ef..ed3b438f89 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_OR_HPP ) #define KOKKOS_ATOMIC_FETCH_OR_HPP @@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const inline int atomic_fetch_or( volatile int * const dest , const int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} inline long int atomic_fetch_or( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} inline unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_or(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_or(dest,val); +} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp index 504731d3a2..038cc13e9a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP ) #define KOKKOS_ATOMIC_FETCH_SUB_HPP @@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest , inline int atomic_fetch_sub( volatile int * const dest , const int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} inline long int atomic_fetch_sub( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} #if defined( KOKKOS_ENABLE_GNU_ATOMICS ) inline unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} inline unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_sub(dest,val); } +{ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + return __sync_fetch_and_sub(dest,val); +} #endif @@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest , { union { int i ; T t ; } assume , oldval , newval ; +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + oldval.t = *dest ; do { @@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest , typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && sizeof(T) == sizeof(long) , const T >::type val ) { 
+#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + union { long i ; T t ; } assume , oldval , newval ; oldval.t = *dest ; @@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile T * const dest , && ( sizeof(T) != 8 ) , const T >::type& val ) { +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) dest, _MM_HINT_ET0 ); +#endif + while( !Impl::lock_address_host_space( (void*) dest ) ); T return_val = *dest; *dest = return_val - val; diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp index 2985fad95e..e7626603fc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp @@ -41,6 +41,10 @@ //@HEADER */ +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) +#include +#endif + #include #if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP ) #define KOKKOS_ATOMIC_INCREMENT_HPP @@ -52,6 +56,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile char* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incb %0" : /* no output registers */ @@ -67,6 +74,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile short* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incw %0" : /* no output registers */ @@ -82,6 +92,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incl %0" : /* no output registers */ @@ -97,6 +110,9 @@ template<> KOKKOS_INLINE_FUNCTION void atomic_increment(volatile long long int* a) { #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! 
defined(__CUDA_ARCH__) +#if defined( KOKKOS_ENABLE_RFO_PREFETCH ) + _mm_prefetch( (const char*) a, _MM_HINT_ET0 ); +#endif __asm__ __volatile__( "lock incq %0" : /* no output registers */ diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index f0ff6d78ec..f52cc469ac 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0); #if defined( KOKKOS_ENABLE_OPENMP ) if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value || std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) { - if(num_threads>0) { - if(use_numa>0) { - Kokkos::OpenMP::initialize(num_threads,use_numa); - } - else { - Kokkos::OpenMP::initialize(num_threads); - } - } else { - Kokkos::OpenMP::initialize(); + if(use_numa>0) { + Kokkos::OpenMP::initialize(num_threads,use_numa); + } + else { + Kokkos::OpenMP::initialize(num_threads); } - //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ; } else { //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ; @@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[]) iarg++; } - InitArguments arguments; - arguments.num_threads = num_threads; - arguments.num_numa = numa; - arguments.device_id = device; + InitArguments arguments{num_threads, numa, device}; Impl::initialize_internal(arguments); } diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp index dc75fb072f..fccd8e090f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp @@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType: static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) , "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" ); + /* this cast to bool is needed for correctness by NVCC */ + enum : bool { IsArray = static_cast(Impl::is_array< typename FunctorType::value_type >::value) }; + // If not an array then what is the sizeof(value_type) - enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) }; + enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) }; typedef value_type * pointer_type ; // The reference_type for an array is 'value_type *' // The reference_type for a single value is 'value_type &' - typedef typename Impl::if_c< ! StaticValueSize , value_type * - , value_type & >::type reference_type ; + typedef typename Impl::if_c< IsArray , value_type * + , value_type & >::type reference_type ; // Number of values if single value template< class F > KOKKOS_FORCEINLINE_FUNCTION static - typename Impl::enable_if< std::is_same::value && StaticValueSize , unsigned >::type + typename Impl::enable_if< std::is_same::value && ! IsArray , unsigned >::type value_count( const F & ) { return 1 ; } // Number of values if an array, protect via templating because 'f.value_count' // will only exist when the functor declares the value_type to be an array. template< class F > KOKKOS_FORCEINLINE_FUNCTION static - typename Impl::enable_if< std::is_same::value && ! 
StaticValueSize , unsigned >::type + typename Impl::enable_if< std::is_same::value && IsArray , unsigned >::type value_count( const F & f ) { return f.value_count ; } // Total size of the value diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp index 8cb7430035..e11f8b6d34 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp @@ -70,62 +70,6 @@ #ifdef KOKKOS_ENABLE_HBWSPACE #define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB) -namespace Kokkos { -namespace Experimental { -namespace { - -static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QuerySpaceInParallelPtr )(); - -QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace - -void HBWSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) ); - } - - int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HBWSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Experiemtal -} // namespace Kokkos - /*--------------------------------------------------------------------------*/ namespace Kokkos { diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index 2a5c34c375..a5a73ddebb 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -106,62 +106,6 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace { - -static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QuerySpaceInParallelPtr )(); - -QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace - -void HostSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) ); - } - - int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! 
(*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HostSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /* Default allocation mechanism */ @@ -340,9 +284,6 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_ } } -constexpr const char* HostSpace::name() { - return m_name; -} } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index ac200209c7..d2446bde09 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -45,7 +45,7 @@ #include #include #include -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -58,9 +58,11 @@ void HostThreadTeamData::organize_pool { bool ok = true ; + memory_fence(); + // Verify not already a member of a pool: for ( int rank = 0 ; rank < size && ok ; ++rank ) { - ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch ); + ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch ); } if ( ok ) { @@ -89,7 +91,6 @@ void HostThreadTeamData::organize_pool mem->m_team_alloc = 1 ; mem->m_league_rank = rank ; mem->m_league_size = size ; - mem->m_pool_rendezvous_step = 0 ; mem->m_team_rendezvous_step = 0 ; pool[ rank ] = mem ; } @@ -116,7 +117,6 @@ void HostThreadTeamData::disband_pool() m_team_alloc = 1 ; m_league_rank = 0 ; m_league_size = 1 ; - m_pool_rendezvous_step = 0 ; m_team_rendezvous_step = 0 ; } @@ -256,11 +256,6 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ; - union { - int64_t full ; - int8_t byte[8] ; - } value ; - if ( rank ) { const int group_begin = rank << shift_byte ; // == rank * size_byte @@ -275,13 +270,14 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int end = group_begin + size_byte < size ? size_byte : size - group_begin ; - value.full = 0 ; - for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step ); + int64_t value = 0 ; - store_fence(); // This should not be needed but fixes #742 + for ( int i = 0 ; i < end ; ++i ) { + ((int8_t*) & value )[i] = int8_t( step ); + } spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] - , value.full ); + , value ); } { @@ -316,10 +312,12 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer const int end = size_byte < size ? 
8 : size ; - value.full = 0 ; - for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step ); + int64_t value = 0 ; + for ( int i = 1 ; i < end ; ++i ) { + ((int8_t *) & value)[i] = int8_t( step ); + } - spinwait_until_equal( buffer[ sync_offset ], value.full ); + spinwait_until_equal( buffer[ sync_offset ], value ); } return rank ? 0 : 1 ; diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index c050a16eae..7facc0a410 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -50,6 +50,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -67,14 +68,12 @@ public: // Assume upper bounds on number of threads: // pool size <= 1024 threads - // pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 2052 // team size <= 64 threads - // team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36 enum : int { max_pool_members = 1024 }; enum : int { max_team_members = 64 }; - enum : int { max_pool_rendezvous = ( max_pool_members / 8 ) * 4 + 4 }; - enum : int { max_team_rendezvous = ( max_team_members / 8 ) * 4 + 4 }; + enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) }; + enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) }; private: @@ -114,7 +113,6 @@ private: int m_league_size ; int m_work_chunk ; int m_steal_rank ; // work stealing rank - int mutable m_pool_rendezvous_step ; int mutable m_team_rendezvous_step ; HostThreadTeamData * team_member( int r ) const noexcept @@ -147,6 +145,7 @@ public: int team_rendezvous( int const root ) const noexcept { return 1 == m_team_size ? 1 : + HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -157,6 +156,7 @@ public: int team_rendezvous() const noexcept { return 1 == m_team_size ? 1 : + HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -167,6 +167,7 @@ public: void team_rendezvous_release() const noexcept { if ( 1 < m_team_size ) { + HostThreadTeamData:: rendezvous_release( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step ); } @@ -175,19 +176,30 @@ public: inline int pool_rendezvous() const noexcept { + static constexpr int yield_wait = + #if defined( KOKKOS_COMPILER_IBM ) + // If running on IBM POWER architecture the global + // level rendzvous should immediately yield when + // waiting for other threads in the pool to arrive. + 1 + #else + 0 + #endif + ; return 1 == m_pool_size ? 1 : + Kokkos::Impl:: rendezvous( m_pool_scratch + m_pool_rendezvous - , m_pool_rendezvous_step , m_pool_size - , m_pool_rank ); + , m_pool_rank + , yield_wait ); } inline void pool_rendezvous_release() const noexcept { if ( 1 < m_pool_size ) { - rendezvous_release( m_pool_scratch + m_pool_rendezvous - , m_pool_rendezvous_step ); + Kokkos::Impl:: + rendezvous_release( m_pool_scratch + m_pool_rendezvous ); } } @@ -213,7 +225,6 @@ public: , m_league_size(1) , m_work_chunk(0) , m_steal_rank(0) - , m_pool_rendezvous_step(0) , m_team_rendezvous_step(0) {} @@ -406,7 +417,7 @@ fflush(stdout); // Steal from next team, round robin // The next team is offset by m_team_alloc if it fits in the pool. - m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? + m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? 
m_team_base + m_team_alloc : 0 ; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp index 98482cfab6..608d514c79 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp @@ -50,51 +50,70 @@ namespace Kokkos { namespace Profiling { +static initFunction initProfileLibrary = nullptr; +static finalizeFunction finalizeProfileLibrary = nullptr; + +static beginFunction beginForCallee = nullptr; +static beginFunction beginScanCallee = nullptr; +static beginFunction beginReduceCallee = nullptr; +static endFunction endForCallee = nullptr; +static endFunction endScanCallee = nullptr; +static endFunction endReduceCallee = nullptr; + +static pushFunction pushRegionCallee = nullptr; +static popFunction popRegionCallee = nullptr; + +static allocateDataFunction allocateDataCallee = nullptr; +static deallocateDataFunction deallocateDataCallee = nullptr; + +static beginDeepCopyFunction beginDeepCopyCallee = nullptr; +static endDeepCopyFunction endDeepCopyCallee = nullptr; + SpaceHandle::SpaceHandle(const char* space_name) { strncpy(name,space_name,64); } bool profileLibraryLoaded() { - return (NULL != initProfileLibrary); + return (nullptr != initProfileLibrary); } void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginForCallee) { + if(nullptr != beginForCallee) { Kokkos::fence(); (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelFor(const uint64_t kernelID) { - if(NULL != endForCallee) { + if(nullptr != endForCallee) { Kokkos::fence(); (*endForCallee)(kernelID); } } void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginScanCallee) { + if(nullptr != beginScanCallee) { Kokkos::fence(); (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelScan(const uint64_t kernelID) { - if(NULL != endScanCallee) { + if(nullptr != endScanCallee) { Kokkos::fence(); (*endScanCallee)(kernelID); } } void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) { - if(NULL != beginReduceCallee) { + if(nullptr != beginReduceCallee) { Kokkos::fence(); (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID); } } void endParallelReduce(const uint64_t kernelID) { - if(NULL != endReduceCallee) { + if(nullptr != endReduceCallee) { Kokkos::fence(); (*endReduceCallee)(kernelID); } @@ -102,31 +121,47 @@ void endParallelReduce(const uint64_t kernelID) { void pushRegion(const std::string& kName) { - if( NULL != pushRegionCallee ) { + if( nullptr != pushRegionCallee ) { Kokkos::fence(); (*pushRegionCallee)(kName.c_str()); } } void popRegion() { - if( NULL != popRegionCallee ) { + if( nullptr != popRegionCallee ) { Kokkos::fence(); (*popRegionCallee)(); } } void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) { - if(NULL != allocateDataCallee) { + if(nullptr != allocateDataCallee) { (*allocateDataCallee)(space,label.c_str(),ptr,size); } } void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) { - if(NULL != allocateDataCallee) { + if(nullptr != deallocateDataCallee) { (*deallocateDataCallee)(space,label.c_str(),ptr,size); } } +void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr, + const SpaceHandle src_space, const std::string 
src_label, const void* src_ptr, + const uint64_t size) { + if(nullptr != beginDeepCopyCallee) { + (*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr, + src_space, src_label.c_str(), src_ptr, + size); + } +} + +void endDeepCopy() { + if(nullptr != endDeepCopyCallee) { + (*endDeepCopyCallee)(); + } +} + void initialize() { // Make sure initialize calls happens only once @@ -140,7 +175,7 @@ void initialize() { // If we do not find a profiling library in the environment then exit // early. - if( NULL == envProfileLibrary ) { + if( nullptr == envProfileLibrary ) { return ; } @@ -149,10 +184,10 @@ void initialize() { char* profileLibraryName = strtok(envProfileCopy, ";"); - if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { + if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) { firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL); - if(NULL == firstProfileLibrary) { + if(nullptr == firstProfileLibrary) { std::cerr << "Error: Unable to load KokkosP library: " << profileLibraryName << std::endl; } else { @@ -191,14 +226,19 @@ void initialize() { auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data"); deallocateDataCallee = *((deallocateDataFunction*) &p12); + auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy"); + beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13); + auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy"); + endDeepCopyCallee = *((endDeepCopyFunction*) &p14); + } } - if(NULL != initProfileLibrary) { + if(nullptr != initProfileLibrary) { (*initProfileLibrary)(0, (uint64_t) KOKKOSP_INTERFACE_VERSION, (uint32_t) 0, - NULL); + nullptr); } free(envProfileCopy); @@ -210,28 +250,30 @@ void finalize() { if(is_finalized) return; is_finalized = 1; - if(NULL != finalizeProfileLibrary) { + if(nullptr != finalizeProfileLibrary) { (*finalizeProfileLibrary)(); - // Set all profile hooks to NULL to prevent + // Set all profile hooks to nullptr to prevent // any additional calls. 
Once we are told to // finalize, we mean it - initProfileLibrary = NULL; - finalizeProfileLibrary = NULL; + initProfileLibrary = nullptr; + finalizeProfileLibrary = nullptr; - beginForCallee = NULL; - beginScanCallee = NULL; - beginReduceCallee = NULL; - endScanCallee = NULL; - endForCallee = NULL; - endReduceCallee = NULL; + beginForCallee = nullptr; + beginScanCallee = nullptr; + beginReduceCallee = nullptr; + endScanCallee = nullptr; + endForCallee = nullptr; + endReduceCallee = nullptr; - pushRegionCallee = NULL; - popRegionCallee = NULL; + pushRegionCallee = nullptr; + popRegionCallee = nullptr; - allocateDataCallee = NULL; - deallocateDataCallee = NULL; + allocateDataCallee = nullptr; + deallocateDataCallee = nullptr; + beginDeepCopyCallee = nullptr; + endDeepCopyCallee = nullptr; } } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index f76e5dfa04..2c2e524d9d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -81,23 +81,11 @@ typedef void (*popFunction)(); typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); - -static initFunction initProfileLibrary = NULL; -static finalizeFunction finalizeProfileLibrary = NULL; - -static beginFunction beginForCallee = NULL; -static beginFunction beginScanCallee = NULL; -static beginFunction beginReduceCallee = NULL; -static endFunction endForCallee = NULL; -static endFunction endScanCallee = NULL; -static endFunction endReduceCallee = NULL; - -static pushFunction pushRegionCallee = NULL; -static popFunction popRegionCallee = NULL; - -static allocateDataFunction allocateDataCallee = NULL; -static deallocateDataFunction deallocateDataCallee = NULL; - +typedef void (*beginDeepCopyFunction)( + SpaceHandle, const char*, const void*, + SpaceHandle, const char*, const void*, + uint64_t); +typedef void (*endDeepCopyFunction)(); bool profileLibraryLoaded(); @@ -114,35 +102,14 @@ void popRegion(); void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); +void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr, + const SpaceHandle src_space, const std::string src_label, const void* src_ptr, + const uint64_t size); +void endDeepCopy(); + void initialize(); void finalize(); -//Define finalize_fake inline to get rid of warnings for unused static variables -inline void finalize_fake() { - if(NULL != finalizeProfileLibrary) { - (*finalizeProfileLibrary)(); - - // Set all profile hooks to NULL to prevent - // any additional calls. 
Once we are told to - // finalize, we mean it - beginForCallee = NULL; - beginScanCallee = NULL; - beginReduceCallee = NULL; - endScanCallee = NULL; - endForCallee = NULL; - endReduceCallee = NULL; - - allocateDataCallee = NULL; - deallocateDataCallee = NULL; - - initProfileLibrary = NULL; - finalizeProfileLibrary = NULL; - pushRegionCallee = NULL; - popRegionCallee = NULL; - } -} - - } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp new file mode 100644 index 0000000000..ac697fce4b --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include + +namespace Kokkos { namespace Impl { + +//---------------------------------------------------------------------------- +/* pattern for rendezvous + * + * if ( rendezvous() ) { + * ... all other threads are still in team_rendezvous() ... + * rendezvous_release(); + * ... all other threads are released from team_rendezvous() ... 
+ * } + */ + +int rendezvous( volatile int64_t * const buffer + , int const size + , int const rank + , int const slow + ) noexcept +{ + enum : int { shift_byte = 3 }; + enum : int { size_byte = ( 01 << shift_byte ) }; // == 8 + enum : int { mask_byte = size_byte - 1 }; + + enum : int { shift_mem_cycle = 2 }; + enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 + enum : int { mask_mem_cycle = size_mem_cycle - 1 }; + + // Cycle step values: 1 <= step <= size_val_cycle + // An odd multiple of memory cycle so that when a memory location + // is reused it has a different value. + // Must be representable within a single byte: size_val_cycle < 16 + + enum : int { size_val_cycle = 3 * size_mem_cycle }; + + // Requires: + // Called by rank = [ 0 .. size ) + // buffer aligned to int64_t[4] + + // A sequence of rendezvous uses four cycled locations in memory + // and non-equal cycled synchronization values to + // 1) prevent rendezvous from overtaking one another and + // 2) give each spin wait location an int64_t[4] span + // so that it has its own cache line. + + const int64_t step = (buffer[0] % size_val_cycle ) + 1 ; + + // The leading int64_t[4] span is for thread 0 to write + // and all other threads to read spin-wait. + // sync_offset is the index into this array for this step. + + const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ; + + if ( rank ) { + + const int group_begin = rank << shift_byte ; // == rank * size_byte + + if ( group_begin < size ) { + + // This thread waits for threads + // [ group_begin .. group_begin + 8 ) + // [ rank*8 .. rank*8 + 8 ) + // to write to their designated bytes. + + const int end = group_begin + size_byte < size + ? size_byte : size - group_begin ; + + int64_t value = 0; + for ( int i = 0 ; i < end ; ++i ) { + value |= step << (i * size_byte ); + } + + store_fence(); // This should not be needed but fixes #742 + + if ( slow ) { + yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] + , value ); + } + else { + spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] + , value ); + } + } + + { + // This thread sets its designated byte. + // ( rank % size_byte ) + + // ( ( rank / size_byte ) * size_byte * size_mem_cycle ) + + // ( sync_offset * size_byte ) + const int offset = ( rank & mask_byte ) + + ( ( rank & ~mask_byte ) << shift_mem_cycle ) + + ( sync_offset << shift_byte ); + + // All of this thread's previous memory stores must be complete before + // this thread stores the step value at this thread's designated byte + // in the shared synchronization array. + + Kokkos::memory_fence(); + + ((volatile int8_t*) buffer)[ offset ] = int8_t( step ); + + // Memory fence to push the previous store out + Kokkos::memory_fence(); + } + + // Wait for thread 0 to release all other threads + + if ( slow ) { + yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); + } + else { + spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); + } + } + else { + // Thread 0 waits for threads [1..7] + // to write to their designated bytes. + + const int end = size_byte < size ? 8 : size ; + + int64_t value = 0; + for ( int i = 1 ; i < end ; ++i ) { + value |= step << (i * size_byte ); + } + + if ( slow ) { + yield_until_equal( buffer[ sync_offset ], value ); + } + else { + spinwait_until_equal( buffer[ sync_offset ], value ); + } + } + + return rank ? 
0 : 1 ; +} + +void rendezvous_release( volatile int64_t * const buffer ) noexcept +{ + enum : int { shift_mem_cycle = 2 }; + enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 + enum : int { mask_mem_cycle = size_mem_cycle - 1 }; + enum : int { size_val_cycle = 3 * size_mem_cycle }; + + // Requires: + // Called after team_rendezvous + // Called only by true == team_rendezvous(root) + + // update step + const int64_t step = (buffer[0] % size_val_cycle ) + 1; + buffer[0] = step; + + // Memory fence to be sure all previous writes are complete: + Kokkos::memory_fence(); + + buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step; + + // Memory fence to push the store out + Kokkos::memory_fence(); +} + +}} // namespace Kokkos::Impl + diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp new file mode 100644 index 0000000000..57f8633bca --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP +#define KOKKOS_IMPL_RENDEZVOUS_HPP + +#include + +namespace Kokkos { namespace Impl { + +inline +constexpr int rendezvous_buffer_size( int max_members ) noexcept +{ + return (((max_members + 7) / 8) * 4) + 4 + 4; +} + +/** \brief Thread pool rendezvous + * + * Rendezvous pattern: + * if ( rendezvous(root) ) { + * ... only root thread here while all others wait ... + * rendezvous_release(); + * } + * else { + * ... all other threads release here ... 
+ * } + * + * Requires: buffer[ rendezvous_buffer_size( max_threads ) ]; + * + * When slow != 0 the expectation is thread arrival will be + * slow so the threads that arrive early should quickly yield + * their core to the runtime thus possibly allowing the late + * arriving threads to have more resources + * (e.g., power and clock frequency). + */ +int rendezvous( volatile int64_t * const buffer + , int const size + , int const rank + , int const slow = 0 ) noexcept ; + +void rendezvous_release( volatile int64_t * const buffer ) noexcept ; + + +}} // namespace Kokkos::Impl + +#endif // KOKKOS_IMPL_RENDEZVOUS_HPP + diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp index 755271c07e..dfbeba461e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -50,6 +50,7 @@ #include #include +#include /*--------------------------------------------------------------------------*/ @@ -123,7 +124,6 @@ void serial_resize_thread_team_data( size_t pool_reduce_bytes } } -// Get thread team data structure for omp_get_thread_num() HostThreadTeamData * serial_get_thread_team_data() { return & g_serial_thread_team_data ; @@ -151,6 +151,8 @@ void Serial::initialize( unsigned threads_count (void) use_cores_per_numa; (void) allow_asynchronous_threadpool; + Impl::SharedAllocationRecord< void, void >::tracking_enable(); + // Init the array of locks used for arbitrarily sized atomics Impl::init_lock_array_host_space(); #if defined(KOKKOS_ENABLE_PROFILING) diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp index 76297161b1..0b6fbd9af0 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp @@ -62,7 +62,7 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute { using execution_space = Kokkos::Serial ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; @@ -122,7 +122,7 @@ void TaskQueueSpecialization< Kokkos::Serial > :: { using execution_space = Kokkos::Serial ; using queue_type = TaskQueue< execution_space > ; - using task_root_type = TaskBase< execution_space , void , void > ; + using task_root_type = TaskBase< void , void , void > ; using Member = Impl::HostThreadTeamMember< execution_space > ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp index 2eb2b5cf52..39deebbbf1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp @@ -65,7 +65,7 @@ public: using execution_space = Kokkos::Serial ; using memory_space = Kokkos::HostSpace ; using queue_type = Kokkos::Impl::TaskQueue< execution_space > ; - using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ; + using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ; using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ; static diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp new file mode 100644 index 0000000000..dc30ffe9e0 --- /dev/null +++ 
b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp
@@ -0,0 +1,102 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
+#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType ,
+                   Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
+                   Kokkos::Serial
+                 >
+  : public Kokkos::Impl::Experimental::
+           WorkGraphExec< FunctorType,
+                          Kokkos::Serial,
+                          Traits ...
+                        >
+{
+private:
+
+  typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
+  typedef Kokkos::Impl::Experimental::
+          WorkGraphExec< FunctorType, Kokkos::Serial, Traits ... > Base ;
+
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    Base::m_functor( i );
+  }
+
+  template< class TagType >
+  typename std::enable_if< !
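+  // (SFINAE: this overload is chosen only for a non-void work tag)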
std::is_same< TagType , void >::value >::type
+  exec_one(const typename Policy::member_type& i) const {
+    const TagType t{} ;
+    Base::m_functor( t , i );
+  }
+
+public:
+
+  inline
+  void execute()
+  {
+    for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
+      exec_one< typename Policy::work_tag >( i );
+      Base::after_work(i);
+    }
+  }
+
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy & arg_policy )
+    : Base( arg_functor, arg_policy )
+  {
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */
diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
index e28c1194a7..af79523e0c 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp
@@ -46,23 +46,23 @@
 namespace Kokkos {
 namespace Impl {
 
-int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
+namespace {
 
-void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
-{
-  // A host thread claim and disable tracking flag
+__thread int t_tracking_enabled = 1;
 
-  while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
 }
 
-void SharedAllocationRecord< void , void >::tracking_release_and_enable()
-{
-  // The host thread that claimed and disabled the tracking flag
-  // now release and enable tracking.
+int SharedAllocationRecord< void , void >::tracking_enabled()
+{ return t_tracking_enabled; }
 
-  if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
-    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
-  }
+void SharedAllocationRecord< void , void >::tracking_disable()
+{
+  t_tracking_enabled = 0;
+}
+
+void SharedAllocationRecord< void , void >::tracking_enable()
+{
+  t_tracking_enabled = 1;
 }
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
index 4dc61bb02e..2e3cc1a163 100644
--- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp
@@ -71,6 +71,9 @@ public:
   KOKKOS_INLINE_FUNCTION static
   const SharedAllocationHeader * get_header( void * alloc_ptr )
     { return reinterpret_cast< SharedAllocationHeader * >( reinterpret_cast< char * >(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
+
+  KOKKOS_INLINE_FUNCTION
+  const char* label() const { return m_label; }
 };
 
 template<>
@@ -83,8 +86,6 @@ protected:
 
   typedef void (* function_type )( SharedAllocationRecord * );
 
-  static int s_tracking_enabled ;
-
   SharedAllocationHeader * const m_alloc_ptr ;
   size_t const                   m_alloc_size ;
   function_type const            m_dealloc ;
@@ -110,17 +111,17 @@ public:
   inline std::string get_label() const { return std::string("Unmanaged"); }
 
-  static int tracking_enabled() { return s_tracking_enabled ; }
+  static int tracking_enabled();
 
   /**\brief A host process thread claims and disables the
    *        shared allocation tracking flag.
    */
-  static void tracking_claim_and_disable();
+  static void tracking_disable();
 
   /**\brief A host process thread releases and enables the
    *        shared allocation tracking flag.
*/ - static void tracking_release_and_enable(); + static void tracking_enable(); ~SharedAllocationRecord() = default ; @@ -317,6 +318,11 @@ public: #endif } + KOKKOS_INLINE_FUNCTION + bool has_record() const { + return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0; + } + KOKKOS_FORCEINLINE_FUNCTION ~SharedAllocationTracker() { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT } diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp new file mode 100644 index 0000000000..3d3f83ed85 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp @@ -0,0 +1,210 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+#include <impl/Kokkos_Spinwait.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <Kokkos_Atomic.hpp>
+
+#if defined( KOKKOS_ENABLE_STDTHREAD )
+  #include <thread>
+#elif !defined( _WIN32 )
+  #include <sched.h>
+  #include <time.h>
+#else
+  #include <process.h>
+  #include <winsock2.h>
+  #include <windows.h>
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+void host_thread_yield( const uint32_t i , const int force_yield )
+{
+  static constexpr uint32_t sleep_limit = 1 << 13 ;
+  static constexpr uint32_t yield_limit = 1 << 12 ;
+
+  const int c = Kokkos::Impl::bit_scan_reverse(i);
+
+  if ( sleep_limit < i ) {
+
+    // Attempt to put the thread to sleep for 'c' microseconds
+    // ( Sleep(c) on Windows sleeps for 'c' milliseconds )
+
+    #if defined( KOKKOS_ENABLE_STDTHREAD )
+      std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) );
+    #elif !defined( _WIN32 )
+      timespec req ;
+      req.tv_sec  = 0 ;
+      req.tv_nsec = 1000 * c ;
+      nanosleep( &req, nullptr );
+    #else /* defined( _WIN32 ) IS Microsoft Windows */
+      Sleep(c);
+    #endif
+  }
+
+  else if ( force_yield || yield_limit < i ) {
+
+    // Attempt to yield thread resources to runtime
+
+    #if defined( KOKKOS_ENABLE_STDTHREAD )
+      std::this_thread::yield();
+    #elif !defined( _WIN32 )
+      sched_yield();
+    #else /* defined( _WIN32 ) IS Microsoft Windows */
+      YieldProcessor();
+    #endif
+  }
+
+  #if defined( KOKKOS_ENABLE_ASM )
+
+  else if ( (1u<<4) < i ) {
+
+    // Insert a few no-ops to quiet the thread:
+
+    for ( int k = 0 ; k < c ; ++k ) {
+      #if defined( __amd64 )  || defined( __amd64__ ) || \
+          defined( __x86_64 ) || defined( __x86_64__ )
+        #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+          asm volatile( "nop\n" );
+        #else
+          __asm__ __volatile__( "nop\n" );
+        #endif
+      #elif defined(__PPC64__)
+        asm volatile( "nop\n" );
+      #endif
+    }
+  }
+
+  {
+    // Insert memory pause
+    #if defined( __amd64 )  || defined( __amd64__ ) || \
+        defined( __x86_64 ) || defined( __x86_64__ )
+      #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
+        asm volatile( "pause\n":::"memory" );
+      #else
+        __asm__ __volatile__( "pause\n":::"memory" );
+      #endif
+    #elif defined(__PPC64__)
+      asm volatile( "or 27, 27, 27" ::: "memory" );
+    #endif
+  }
+
+  #endif /* defined( KOKKOS_ENABLE_ASM ) */
+}
+
+}}} // namespace Kokkos::Impl::{anonymous}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
+  Kokkos::load_fence();
+}
+
+void yield_while_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
+  Kokkos::load_fence();
+}
+
+void yield_until_equal( volatile
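+// (the yield_* variants pass force_yield = 1 so a waiting thread
+//  always surrenders its core to the runtime rather than spinning)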
int32_t & flag , const int32_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_while_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +void yield_until_equal( volatile int64_t & flag , const int64_t value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); + Kokkos::load_fence(); +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#else +void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp similarity index 82% rename from lib/kokkos/core/src/impl/Kokkos_spinwait.hpp rename to lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp index 6e34b8a943..b49e308566 100644 --- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp @@ -59,6 +59,13 @@ void spinwait_until_equal( volatile int32_t & flag , const int32_t value ); void spinwait_while_equal( volatile int64_t & flag , const int64_t value ); void spinwait_until_equal( volatile int64_t & flag , const int64_t value ); + +void yield_while_equal( volatile int32_t & flag , const int32_t value ); +void yield_until_equal( volatile int32_t & flag , const int32_t value ); + +void yield_while_equal( volatile int64_t & flag , const int64_t value ); +void yield_until_equal( volatile int64_t & flag , const int64_t value ); + #else KOKKOS_INLINE_FUNCTION @@ -71,6 +78,16 @@ void spinwait_while_equal( volatile int64_t & , const int64_t ) {} KOKKOS_INLINE_FUNCTION void spinwait_until_equal( volatile int64_t & , const int64_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_while_equal( volatile int32_t & , const int32_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_until_equal( volatile int32_t & , const int32_t ) {} + +KOKKOS_INLINE_FUNCTION +void yield_while_equal( volatile int64_t & , const int64_t ) {} +KOKKOS_INLINE_FUNCTION +void yield_until_equal( volatile int64_t & , const int64_t ) {} + #endif } /* namespace Impl */ diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp index bee98e6745..5f8699302d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp @@ -59,24 +59,15 @@ namespace Kokkos { namespace Impl { -/*\brief Implementation data for task data management, access, and execution. - * - * Curiously recurring template pattern (CRTP) - * to allow static_cast from the - * task root type and a task's FunctorType. - * - * TaskBase< Space , ResultType , FunctorType > - * : TaskBase< Space , ResultType , void > - * , FunctorType - * { ... }; - * - * TaskBase< Space , ResultType , void > - * : TaskBase< Space , void , void > - * { ... }; - */ -template< typename Space , typename ResultType , typename FunctorType > +template< class Space , typename ResultType , class FunctorType > class TaskBase ; +template< typename Space > +class TaskQueue ; + +template< typename Space > +class TaskQueueSpecialization ; + } /* namespace Impl */ } /* namespace Kokkos */ @@ -86,8 +77,217 @@ class TaskBase ; namespace Kokkos { namespace Impl { -template< typename Space > -class TaskQueueSpecialization ; +/** \brief Base class for task management, access, and execution. 
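+ * (This fully-specialized root type gives queues, futures, and schedulers
+ *  one pointer type through which any concrete task can be manipulated.)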
+ *
+ * Inheritance structure to allow static_cast from the task root type
+ * and a task's FunctorType.
+ *
+ *   // Enable a functor to access the base class
+ *   // and provide memory for result value.
+ *   TaskBase< Space , ResultType , FunctorType >
+ *     : TaskBase< void , void , void >
+ *     , FunctorType
+ *   { ... };
+ *   Followed by memory allocated for result value.
+ *
+ *
+ *  States of a task:
+ *
+ *    Constructing State, NOT IN a linked list
+ *      m_wait == 0
+ *      m_next == 0
+ *
+ *    Scheduling transition : Constructing -> Waiting
+ *      before:
+ *        m_wait == 0
+ *        m_next == this task's initial dependence, 0 if none
+ *      after:
+ *        m_wait == EndTag
+ *        m_next == EndTag
+ *
+ *    Waiting State, IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == next of linked list of tasks
+ *
+ *    transition : Waiting -> Executing
+ *      before:
+ *        m_next == EndTag
+ *      after:
+ *        m_next == LockTag
+ *
+ *    Executing State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == LockTag
+ *
+ *    Respawn transition : Executing -> Executing-Respawn
+ *      before:
+ *        m_next == LockTag
+ *      after:
+ *        m_next == this task's updated dependence, 0 if none
+ *
+ *    Executing-Respawn State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == this task's updated dependence, 0 if none
+ *
+ *    transition : Executing -> Complete
+ *      before:
+ *        m_wait == head of linked list
+ *      after:
+ *        m_wait == LockTag
+ *
+ *    Complete State, NOT IN a linked list
+ *      m_wait == LockTag: cannot add dependence (<=> complete)
+ *      m_next == LockTag: not a member of a wait queue
+ *
+ */
+template<>
+class TaskBase< void , void , void >
+{
+public:
+
+  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
+  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
+
+  template< typename > friend class Kokkos::TaskScheduler ;
+
+  typedef TaskQueue< void > queue_type ;
+
+  typedef void (* function_type) ( TaskBase * , void * );
+
+  // sizeof(TaskBase) == 48
+
+  function_type  m_apply ;      ///< Apply function pointer
+  queue_type   * m_queue ;      ///< Pointer to queue
+  TaskBase     * m_wait ;       ///< Linked list of tasks waiting on this
+  TaskBase     * m_next ;       ///< Waiting linked-list next
+  int32_t        m_ref_count ;  ///< Reference count
+  int32_t        m_alloc_size ; ///< Allocation size
+  int32_t        m_dep_count ;  ///< Aggregate's number of dependences
+  int16_t        m_task_type ;  ///< Type of task
+  int16_t        m_priority ;   ///< Priority of runnable task
+
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  TaskBase()
+    : m_apply( 0 )
+    , m_queue( 0 )
+    , m_wait( 0 )
+    , m_next( 0 )
+    , m_ref_count( 0 )
+    , m_alloc_size( 0 )
+    , m_dep_count( 0 )
+    , m_task_type( 0 )
+    , m_priority( 0 )
+    {}
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase * volatile * aggregate_dependences() volatile
+    { return reinterpret_cast< TaskBase * volatile * >( this + 1 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool requested_respawn()
+  {
+    // This should only be called when a task has finished executing and is
+    // in the transition to either the complete or executing-respawn
state. + TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag ); + return lock != m_next; + } + + KOKKOS_INLINE_FUNCTION + void add_dependence( TaskBase* dep ) + { + // Precondition: lock == m_next + + TaskBase * const lock = (TaskBase *) LockTag ; + + // Assign dependence to m_next. It will be processed in the subsequent + // call to schedule. Error if the dependence is reset. + if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) { + Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); + } + + if ( 0 != dep ) { + // The future may be destroyed upon returning from this call + // so increment reference count to track this assignment. + Kokkos::atomic_increment( &(dep->m_ref_count) ); + } + } + + //---------------------------------------- + + KOKKOS_INLINE_FUNCTION + int32_t reference_count() const + { return *((int32_t volatile *)( & m_ref_count )); } + +}; + +static_assert( sizeof(TaskBase) == 48 + , "Verifying expected sizeof(TaskBase)" ); + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< typename ResultType > +struct TaskResult { + + enum : int32_t { size = sizeof(ResultType) }; + + using reference_type = ResultType & ; + + KOKKOS_INLINE_FUNCTION static + ResultType * ptr( TaskBase * task ) + { + return reinterpret_cast< ResultType * > + ( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) ); + } + + KOKKOS_INLINE_FUNCTION static + reference_type get( TaskBase * task ) + { return *ptr( task ); } +}; + +template<> +struct TaskResult< void > { + + enum : int32_t { size = 0 }; + + using reference_type = void ; + + KOKKOS_INLINE_FUNCTION static + void * ptr( TaskBase * ) { return (void*) 0 ; } + + KOKKOS_INLINE_FUNCTION static + reference_type get( TaskBase * ) {} +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +class TaskQueue< void > {}; /** \brief Manage task allocation, deallocation, and scheduling. * @@ -95,7 +295,7 @@ class TaskQueueSpecialization ; * All other aspects of task management have shared implementation. 
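 * A spawned task is stored as [ task object | result value ], with each
 * piece rounded up to a 16-byte boundary; see spawn_allocation_size().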
 */
 template< typename ExecSpace >
-class TaskQueue {
+class TaskQueue : public TaskQueue< void > {
 private:
 
   friend class TaskQueueSpecialization< ExecSpace > ;
@@ -106,7 +306,7 @@ private:
   using memory_space   = typename specialization::memory_space ;
   using device_type    = Kokkos::Device< execution_space , memory_space > ;
   using memory_pool    = Kokkos::MemoryPool< device_type > ;
-  using task_root_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using task_root_type = Kokkos::Impl::TaskBase< void , void , void > ;
 
   struct Destroy {
     TaskQueue * m_queue ;
@@ -198,12 +398,10 @@ public:
     }
 
   // Assign task pointer with reference counting of assigned tasks
-  template< typename LV , typename RV >
   KOKKOS_FUNCTION static
-  void assign( TaskBase< execution_space,LV,void> ** const lhs
-             , TaskBase< execution_space,RV,void> *  const rhs )
+  void assign( task_root_type ** const lhs
+             , task_root_type *  const rhs )
     {
-      using task_lhs = TaskBase< execution_space,LV,void> ;
 #if 0
   {
     printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
@@ -225,7 +423,7 @@ public:
 
       // Force write of *lhs
 
-      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+      *static_cast< task_root_type * volatile * >(lhs) = rhs ;
 
       Kokkos::memory_fence();
     }
@@ -238,6 +436,38 @@ public:
 
   KOKKOS_FUNCTION
   void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
+
+
+  //----------------------------------------
+  /**\brief  Allocation size for a spawned task */
+
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+    {
+      using value_type = typename FunctorType::value_type ;
+
+      using task_type = Impl::TaskBase< execution_space
+                                      , value_type
+                                      , FunctorType > ;
+
+      enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
+      enum : size_t { task_size   = sizeof(task_type) };
+      enum : size_t { result_size = Impl::TaskResult< value_type >::size };
+      enum : size_t { alloc_size =
+        ( ( task_size   + align_mask ) & ~align_mask ) +
+        ( ( result_size + align_mask ) & ~align_mask ) };
+
+      return m_memory.allocate_block_size( alloc_size );
+    }
+
+  /**\brief  Allocation size for a when_all aggregate */
+
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+    {
+      return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) );
+    }
 };
 
 } /* namespace Impl */
@@ -249,261 +479,9 @@ public:
 namespace Kokkos {
 namespace Impl {
 
-template<>
-class TaskBase< void , void , void > {
-public:
-  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
-  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
-};
-
-/** \brief Base class for task management, access, and execution.
- *
- * Inheritance structure to allow static_cast from the task root type
- * and a task's FunctorType.
- *
- *   // Enable a Future to access result data
- *   TaskBase< Space , ResultType , void >
- *     : TaskBase< void , void , void >
- *   { ... };
- *
- *   // Enable a functor to access the base class
- *   TaskBase< Space , ResultType , FunctorType >
- *     : TaskBase< Space , ResultType , void >
- *     , FunctorType
- *   { ...
}; - * - * - * States of a task: - * - * Constructing State, NOT IN a linked list - * m_wait == 0 - * m_next == 0 - * - * Scheduling transition : Constructing -> Waiting - * before: - * m_wait == 0 - * m_next == this task's initial dependence, 0 if none - * after: - * m_wait == EndTag - * m_next == EndTag - * - * Waiting State, IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == next of linked list of tasks - * - * transition : Waiting -> Executing - * before: - * m_next == EndTag - * after:: - * m_next == LockTag - * - * Executing State, NOT IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == LockTag - * - * Respawn transition : Executing -> Executing-Respawn - * before: - * m_next == LockTag - * after: - * m_next == this task's updated dependence, 0 if none - * - * Executing-Respawn State, NOT IN a linked list - * m_apply != 0 - * m_queue != 0 - * m_ref_count > 0 - * m_wait == head of linked list of tasks waiting on this task - * m_next == this task's updated dependence, 0 if none - * - * transition : Executing -> Complete - * before: - * m_wait == head of linked list - * after: - * m_wait == LockTag - * - * Complete State, NOT IN a linked list - * m_wait == LockTag: cannot add dependence - * m_next == LockTag: not a member of a wait queue - * - */ -template< typename ExecSpace > -class TaskBase< ExecSpace , void , void > -{ -public: - - enum : int16_t { TaskTeam = TaskBase::TaskTeam - , TaskSingle = TaskBase::TaskSingle - , Aggregate = TaskBase::Aggregate }; - - enum : uintptr_t { LockTag = TaskBase::LockTag - , EndTag = TaskBase::EndTag }; - - using execution_space = ExecSpace ; - using queue_type = TaskQueue< execution_space > ; - - template< typename > friend class Kokkos::TaskScheduler ; - - typedef void (* function_type) ( TaskBase * , void * ); - - // sizeof(TaskBase) == 48 - - function_type m_apply ; ///< Apply function pointer - queue_type * m_queue ; ///< Queue in which this task resides - TaskBase * m_wait ; ///< Linked list of tasks waiting on this - TaskBase * m_next ; ///< Waiting linked-list next - int32_t m_ref_count ; ///< Reference count - int32_t m_alloc_size ; ///< Allocation size - int32_t m_dep_count ; ///< Aggregate's number of dependences - int16_t m_task_type ; ///< Type of task - int16_t m_priority ; ///< Priority of runnable task - - TaskBase() = delete ; - TaskBase( TaskBase && ) = delete ; - TaskBase( const TaskBase & ) = delete ; - TaskBase & operator = ( TaskBase && ) = delete ; - TaskBase & operator = ( const TaskBase & ) = delete ; - - KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; - - // Constructor for a runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , TaskBase * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - ) noexcept - : m_apply( arg_apply ) - , m_queue( arg_queue ) - , m_wait( 0 ) - , m_next( arg_dependence ) - , m_ref_count( arg_ref_count ) - , m_alloc_size( arg_alloc_size ) - , m_dep_count( 0 ) - , m_task_type( arg_task_type ) - , m_priority( arg_priority ) - {} - - // Constructor for an aggregate task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( queue_type * arg_queue - , int arg_ref_count - , int arg_alloc_size - , int arg_dep_count - ) noexcept - : m_apply( 0 ) - , m_queue( arg_queue ) - , m_wait( 0 ) - , m_next( 0 ) - , 
m_ref_count( arg_ref_count ) - , m_alloc_size( arg_alloc_size ) - , m_dep_count( arg_dep_count ) - , m_task_type( Aggregate ) - , m_priority( 0 ) - {} - - //---------------------------------------- - - KOKKOS_INLINE_FUNCTION - TaskBase ** aggregate_dependences() - { return reinterpret_cast( this + 1 ); } - - KOKKOS_INLINE_FUNCTION - bool requested_respawn() - { - // This should only be called when a task has finished executing and is - // in the transition to either the complete or executing-respawn state. - TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag ); - return lock != m_next; - } - - KOKKOS_INLINE_FUNCTION - void add_dependence( TaskBase* dep ) - { - // Precondition: lock == m_next - - TaskBase * const lock = (TaskBase *) LockTag ; - - // Assign dependence to m_next. It will be processed in the subsequent - // call to schedule. Error if the dependence is reset. - if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) { - Kokkos::abort("TaskScheduler ERROR: resetting task dependence"); - } - - if ( 0 != dep ) { - // The future may be destroyed upon returning from this call - // so increment reference count to track this assignment. - Kokkos::atomic_increment( &(dep->m_ref_count) ); - } - } - - using get_return_type = void ; - - KOKKOS_INLINE_FUNCTION - get_return_type get() const {} -}; - -template < typename ExecSpace , typename ResultType > -class TaskBase< ExecSpace , ResultType , void > - : public TaskBase< ExecSpace , void , void > -{ -private: - - using root_type = TaskBase ; - using function_type = typename root_type::function_type ; - using queue_type = typename root_type::queue_type ; - - static_assert( sizeof(root_type) == 48 , "" ); - - TaskBase() = delete ; - TaskBase( TaskBase && ) = delete ; - TaskBase( const TaskBase & ) = delete ; - TaskBase & operator = ( TaskBase && ) = delete ; - TaskBase & operator = ( const TaskBase & ) = delete ; - -public: - - ResultType m_result ; - - KOKKOS_INLINE_FUNCTION ~TaskBase() = default ; - - // Constructor for runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , root_type * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - ) - : root_type( arg_apply - , arg_queue - , arg_dependence - , arg_ref_count - , arg_alloc_size - , arg_task_type - , arg_priority - ) - , m_result() - {} - - using get_return_type = ResultType const & ; - - KOKKOS_INLINE_FUNCTION - get_return_type get() const { return m_result ; } -}; - -template< typename ExecSpace , typename ResultType , typename FunctorType > +template< class ExecSpace , typename ResultType , class FunctorType > class TaskBase - : public TaskBase< ExecSpace , ResultType , void > + : public TaskBase< void , void , void > , public FunctorType { private: @@ -516,50 +494,31 @@ private: public: - using root_type = TaskBase< ExecSpace , void , void > ; - using base_type = TaskBase< ExecSpace , ResultType , void > ; - using specialization = TaskQueueSpecialization< ExecSpace > ; - using function_type = typename root_type::function_type ; - using queue_type = typename root_type::queue_type ; - using member_type = typename specialization::member_type ; + using root_type = TaskBase< void , void , void > ; using functor_type = FunctorType ; using result_type = ResultType ; - template< typename Type > - KOKKOS_INLINE_FUNCTION static - void apply_functor - ( Type * const task - , typename std::enable_if - < std::is_same< typename Type::result_type , void >::value - , 
member_type * const - >::type member - ) - { - using fType = typename Type::functor_type ; - static_cast(task)->operator()( *member ); - } + using specialization = TaskQueueSpecialization< ExecSpace > ; + using member_type = typename specialization::member_type ; - template< typename Type > - KOKKOS_INLINE_FUNCTION static - void apply_functor - ( Type * const task - , typename std::enable_if - < ! std::is_same< typename Type::result_type , void >::value - , member_type * const - >::type member - ) - { - using fType = typename Type::functor_type ; - static_cast(task)->operator()( *member , task->m_result ); - } + KOKKOS_INLINE_FUNCTION + void apply_functor( member_type * const member , void * ) + { functor_type::operator()( *member ); } + + template< typename T > + KOKKOS_INLINE_FUNCTION + void apply_functor( member_type * const member + , T * const result ) + { functor_type::operator()( *member , *result ); } KOKKOS_FUNCTION static void apply( root_type * root , void * exec ) { TaskBase * const task = static_cast< TaskBase * >( root ); member_type * const member = reinterpret_cast< member_type * >( exec ); + result_type * const result = TaskResult< result_type >::ptr( task ); - TaskBase::template apply_functor( task , member ); + task->apply_functor( member , result ); // Task may be serial or team. // If team then must synchronize before querying if respawn was requested. @@ -576,26 +535,9 @@ public: } // Constructor for runnable task - KOKKOS_INLINE_FUNCTION - constexpr TaskBase( function_type arg_apply - , queue_type * arg_queue - , root_type * arg_dependence - , int arg_ref_count - , int arg_alloc_size - , int arg_task_type - , int arg_priority - , FunctorType && arg_functor - ) - : base_type( arg_apply - , arg_queue - , arg_dependence - , arg_ref_count - , arg_alloc_size - , arg_task_type - , arg_priority - ) - , functor_type( arg_functor ) - {} + KOKKOS_INLINE_FUNCTION constexpr + TaskBase( FunctorType && arg_functor ) + : root_type() , functor_type( std::move(arg_functor) ) {} KOKKOS_INLINE_FUNCTION ~TaskBase() {} diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp index aee381afad..1974f7e1ca 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp @@ -44,6 +44,8 @@ #include #if defined( KOKKOS_ENABLE_TASKDAG ) +#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0 + namespace Kokkos { namespace Impl { @@ -100,9 +102,11 @@ KOKKOS_FUNCTION void TaskQueue< ExecSpace >::decrement ( TaskQueue< ExecSpace >::task_root_type * task ) { - const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1); + task_root_type volatile & t = *task ; -#if 0 + const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1); + +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING if ( 1 == count ) { printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n" , uintptr_t( task ) @@ -114,9 +118,13 @@ void TaskQueue< ExecSpace >::decrement #endif if ( ( 1 == count ) && - ( task->m_next == (task_root_type *) task_root_type::LockTag ) ) { + ( t.m_next == (task_root_type *) task_root_type::LockTag ) ) { // Reference count is zero and task is complete, deallocate. 
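+    // (reads and the final store go through the volatile reference so the
+    //  compiler re-reads task state after the atomic reference-count update)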
- task->m_queue->deallocate( task , task->m_alloc_size ); + + TaskQueue< ExecSpace > * const queue = + static_cast< TaskQueue< ExecSpace > * >( t.m_queue ); + + queue->deallocate( task , t.m_alloc_size ); } else if ( count <= 1 ) { Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" ); @@ -171,7 +179,7 @@ bool TaskQueue< ExecSpace >::push_task // Fail the push attempt if the queue is locked; // otherwise retry until the push succeeds. -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" , uintptr_t(queue) , uintptr_t(*queue) @@ -186,9 +194,9 @@ bool TaskQueue< ExecSpace >::push_task task_root_type * const zero = (task_root_type *) 0 ; task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; - task_root_type * volatile * const next = & task->m_next ; + task_root_type * volatile & next = task->m_next ; - if ( zero != *next ) { + if ( zero != next ) { Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" ); } @@ -196,9 +204,9 @@ bool TaskQueue< ExecSpace >::push_task while ( lock != y ) { - *next = y ; + next = y ; - // Do not proceed until '*next' has been stored. + // Do not proceed until 'next' has been stored. Kokkos::memory_fence(); task_root_type * const x = y ; @@ -211,9 +219,9 @@ bool TaskQueue< ExecSpace >::push_task // Failed, replace 'task->m_next' value since 'task' remains // not a member of a queue. - *next = zero ; + next = zero ; - // Do not proceed until '*next' has been stored. + // Do not proceed until 'next' has been stored. Kokkos::memory_fence(); return false ; @@ -270,11 +278,13 @@ TaskQueue< ExecSpace >::pop_ready_task // This thread has exclusive access to // the queue and the popped task's m_next. 
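+      // (storing the lock value in m_next marks the task as Executing,
+      //  i.e. not a member of any queue; see the TaskBase state diagram)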
- *queue = task->m_next ; task->m_next = lock ; + task_root_type * volatile & next = task->m_next ; + + *queue = next ; next = lock ; Kokkos::memory_fence(); -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n" , uintptr_t(queue) , uintptr_t(task) @@ -323,7 +333,7 @@ void TaskQueue< ExecSpace >::schedule_runnable // task->m_wait == head of linked list (queue) // task->m_next == member of linked list (queue) -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -337,20 +347,22 @@ void TaskQueue< ExecSpace >::schedule_runnable task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + task_root_type volatile & t = *task ; + bool respawn = false ; //---------------------------------------- - if ( zero == task->m_wait ) { + if ( zero == t.m_wait ) { // Task in Constructing state // - Transition to Waiting state // Preconditions: // - call occurs exclusively within a single thread - task->m_wait = end ; + t.m_wait = end ; // Task in Waiting state } - else if ( lock != task->m_wait ) { + else if ( lock != t.m_wait ) { // Task in Executing state with Respawn request // - Update dependence // - Transition to Waiting state @@ -373,7 +385,9 @@ void TaskQueue< ExecSpace >::schedule_runnable // Exclusive access so don't need an atomic exchange // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero ); - task_root_type * dep = task->m_next ; task->m_next = zero ; + task_root_type * dep = t.m_next ; t.m_next = zero ; + + Kokkos::memory_fence(); const bool is_ready = ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) ); @@ -398,7 +412,7 @@ void TaskQueue< ExecSpace >::schedule_runnable Kokkos::atomic_increment( & m_ready_count ); task_root_type * volatile * const ready_queue = - & m_ready[ task->m_priority ][ task->m_task_type ]; + & m_ready[ t.m_priority ][ t.m_task_type ]; // A push_task fails if the ready queue is locked. // A ready queue is only locked during a push or pop; @@ -441,7 +455,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate // task->m_wait == head of linked list (queue) // task->m_next == member of linked list (queue) -#if 0 +#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n" , uintptr_t(task) , uintptr_t(task->m_wait) @@ -455,18 +469,20 @@ void TaskQueue< ExecSpace >::schedule_aggregate task_root_type * const lock = (task_root_type *) task_root_type::LockTag ; task_root_type * const end = (task_root_type *) task_root_type::EndTag ; + task_root_type volatile & t = *task ; + //---------------------------------------- - if ( zero == task->m_wait ) { + if ( zero == t.m_wait ) { // Task in Constructing state // - Transition to Waiting state // Preconditions: // - call occurs exclusively within a single thread - task->m_wait = end ; + t.m_wait = end ; // Task in Waiting state } - else if ( lock == task->m_wait ) { + else if ( lock == t.m_wait ) { // Task in Complete state Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete"); } @@ -477,14 +493,14 @@ void TaskQueue< ExecSpace >::schedule_aggregate // (1) created or // (2) being removed from a completed task's wait list. 
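 // In either case every remaining dependence must be re-examined, and the
 // aggregate completes only when all of its dependences are complete.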
-  task_root_type ** const aggr = task->aggregate_dependences();
+  task_root_type * volatile * const aggr = t.aggregate_dependences();
 
   // Assume the 'when_all' is complete until a dependence is
   // found that is not complete.
 
   bool is_complete = true ;
 
-  for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
+  for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) {
 
     --i ;
 
@@ -523,7 +539,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
     // Complete the when_all 'task' to schedule other tasks
     // that are waiting for the when_all 'task' to complete.
 
-    task->m_next = lock ;
+    t.m_next = lock ;
 
     complete( task );
 
@@ -573,7 +589,7 @@ void TaskQueue< ExecSpace >::complete
   task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
   task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
-#if 0
+#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
   printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
         , uintptr_t(task)
        , uintptr_t(task->m_wait)
@@ -584,11 +600,13 @@
   fflush( stdout );
 #endif
 
-  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+  task_root_type volatile & t = *task ;
+
+  const bool runnable = task_root_type::Aggregate != t.m_task_type ;
 
   //----------------------------------------
 
-  if ( runnable && lock != task->m_next ) {
+  if ( runnable && lock != t.m_next ) {
     // A runnable task has finished executing and requested respawn.
     // Schedule the task for subsequent execution.
 
@@ -607,7 +625,7 @@
     // Stop other tasks from adding themselves to this task's wait queue
     // by locking the head of this task's wait queue.
 
-    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
+    task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock );
 
     if ( x != (task_root_type *) lock ) {
 
@@ -627,9 +645,13 @@
         // Have exclusive access to 'x' until it is scheduled
         // Set x->m_next = zero <= no dependence, not a respawn
 
-        task_root_type * const next = x->m_next ; x->m_next = 0 ;
+        task_root_type volatile & vx = *x ;
 
-        if ( task_root_type::Aggregate != x->m_task_type ) {
+        task_root_type * const next = vx.m_next ; vx.m_next = 0 ;
+
+        Kokkos::memory_fence();
+
+        if ( task_root_type::Aggregate != vx.m_task_type ) {
           schedule_runnable( x );
         }
         else {
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp
index c55636b64e..ed1a71bea7 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewArray.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 // Kokkos v. 2.0
 // Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,7 +47,6 @@
 #include 
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 template< class DataType , class ArrayLayout , class V , size_t N , class P >
@@ -94,13 +93,12 @@ public:
   typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type  non_const_scalar_array_type ;
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  View mapping for non-specialized data type and standard layout */
@@ -597,7 +595,7 @@ public:
   }
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
index 6381aee468..f32c6bb2ee 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@@ -96,6 +96,27 @@ struct is_view_label< const char[N] > : public std::true_type {};
 template< typename ... P >
 struct ViewCtorProp ;
 
+// Forward declare
+template< typename Specialize , typename T >
+struct CommonViewAllocProp ;
+
+/* Common value_type stored as ViewCtorProp
+ */
+template< typename Specialize , typename T >
+struct ViewCtorProp< void , CommonViewAllocProp< Specialize , T > >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  using type = CommonViewAllocProp< Specialize , T > ;
+
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+  ViewCtorProp( type && arg ) : value( arg ) {}
+
+  type value ;
+};
+
 /* std::integral_constant are dummy arguments
  * that avoid duplicate base class errors
  */
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
index 900bd88f1c..d346f9e639 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@@ -62,7 +62,6 @@
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 template< unsigned I , size_t ... Args >
@@ -250,7 +249,7 @@ struct ViewDimensionAssignable< ViewDimension< DstArgs ... >
 
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -266,14 +265,11 @@ struct ALL_t {
 }} // namespace Kokkos::Impl
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
-using Kokkos::Impl::ALL_t ;
-
 template< class T >
 struct is_integral_extent_type
-{ enum { value = std::is_same< T , ALL_t >::value ? 1 : 0 }; };
+{ enum { value = std::is_same< T , Kokkos::Impl::ALL_t >::value ?
1 : 0 }; };
 
 template< class iType >
 struct is_integral_extent_type< std::pair< iType , iType > >
@@ -314,10 +310,10 @@ struct SubviewLegalArgsCompileTime;
 
 template< class LayoutDest , class LayoutSrc , int RankDest , int RankSrc , int CurrentArg , class Arg , class ... SubViewArgs >
 struct SubviewLegalArgsCompileTime< Kokkos::LayoutLeft , Kokkos::LayoutLeft , RankDest , RankSrc , CurrentArg , Arg , SubViewArgs... > {
-  enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
+  enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
                  ((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) ||
                  ((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ||
-                 ((CurrentArg==0) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value))
+                 ((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value))
                 ) && (SubviewLegalArgsCompileTime< LayoutDest , LayoutSrc , RankDest , RankSrc , CurrentArg+1 , SubViewArgs... >::value)};
 };
 
@@ -331,7 +327,7 @@ struct SubviewLegalArgsCompileTime
 struct SubviewLegalArgsCompileTime< Kokkos::LayoutRight , Kokkos::LayoutRight , RankDest , RankSrc , CurrentArg , Arg , SubViewArgs... > {
-  enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
+  enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
                  ((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) ||
                  ((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value))
                 ) && (SubviewLegalArgsCompileTime< LayoutDest , LayoutSrc , RankDest , RankSrc , CurrentArg+1 , SubViewArgs... >::value)};
 };
@@ -403,7 +399,7 @@ private:
   bool set( unsigned domain_rank
           , unsigned range_rank
           , const ViewDimension< DimArgs ... > & dim
-          , const Kokkos::Experimental::Impl::ALL_t
+          , const Kokkos::Impl::ALL_t
           , Args ... args )
     {
       m_begin[ domain_rank ] = 0 ;
@@ -519,7 +515,7 @@ private:
           , unsigned domain_rank
           , unsigned range_rank
           , const ViewDimension< DimArgs ... > & dim
-          , const Kokkos::Experimental::Impl::ALL_t
+          , const Kokkos::Impl::ALL_t
          , Args ... args ) const
     {
       const int n = std::min( buf_len ,
@@ -670,13 +666,12 @@ public:
     { return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; }
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  Given a value type and dimension generate the View data type */
@@ -814,13 +809,12 @@ public:
   typedef non_const_type non_const_scalar_array_type ;
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 template < class Dimension , class Layout , typename Enable = void >
@@ -1228,14 +1222,14 @@ private:
 
   // If memory alignment is a multiple of the trivial scalar size then attempt to align.
   enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
-  enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+  enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo zero in constexpr
 
   KOKKOS_INLINE_FUNCTION
   static constexpr size_t stride( size_t const N )
-  {
-    return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
-           ? N + align - ( N % div_ok ) : N ;
-  }
+  {
+    return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
+           ? N + align - ( N % div_ok ) : N ;
+  }
 };
 
 public:
@@ -1707,12 +1701,12 @@ private:
 
   // If memory alignment is a multiple of the trivial scalar size then attempt to align.
  enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
-  enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+  enum { div_ok = (div != 0) ?
div : 1 }; // To avoid modulo zero in constexpr
 
   KOKKOS_INLINE_FUNCTION
   static constexpr size_t stride( size_t const N )
   {
-    return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
+    return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
            ? N + align - ( N % div_ok ) : N ;
   }
 };
@@ -2225,13 +2219,12 @@ public:
     {}
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 /** \brief  ViewDataHandle provides the type of the 'data handle' which the view
@@ -2422,13 +2415,12 @@ struct ViewDataHandle< Traits ,
     return handle_type( arg_data_ptr + offset );
   }
 };
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Experimental {
 namespace Impl {
 
 //----------------------------------------------------------------------------
@@ -2451,8 +2443,9 @@ template< class ExecSpace , class ValueType >
 struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
 {
   typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
+  typedef typename ExecSpace::execution_space Exec;
 
-  ExecSpace space ;
+  Exec      space ;
   ValueType * ptr ;
   size_t      n ;
   bool        destroy ;
@@ -2597,6 +2590,9 @@ private:
 
 public:
 
+  typedef void printable_label_typedef;
+  enum { is_managed = Traits::is_managed };
+
   //----------------------------------------
   // Domain dimensions
 
@@ -2944,7 +2940,7 @@ public:
         Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
       }
       dst.m_offset = dst_offset_type( src.m_offset );
-      dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
+      dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
     }
 };
 
@@ -3102,7 +3098,7 @@ public:
 
 //----------------------------------------------------------------------------
 
-}}} // namespace Kokkos::Experimental::Impl
+}} // namespace Kokkos::Impl
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -3151,6 +3147,77 @@ void view_error_operator_bounds
   view_error_operator_bounds<0>(buf+n,len-n,map,args...);
 }
 
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+/* Check #3: is the View managed as determined by the MemoryTraits?
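+   (Unmanaged Views have no SharedAllocationHeader in front of their
+   allocation, so only the generic abort message is possible for them.)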
 */
+template< class MapType,
+          bool is_managed = (MapType::is_managed != 0) >
+struct OperatorBoundsErrorOnDevice;
+
+template< class MapType >
+struct OperatorBoundsErrorOnDevice< MapType, false > {
+KOKKOS_INLINE_FUNCTION
+static void run(MapType const&) {
+  Kokkos::abort("View bounds error");
+}
+};
+
+template< class MapType >
+struct OperatorBoundsErrorOnDevice< MapType, true > {
+KOKKOS_INLINE_FUNCTION
+static void run(MapType const& map) {
+  char const* const user_alloc_start = reinterpret_cast<char const*>(map.data());
+  char const* const header_start = user_alloc_start - sizeof(SharedAllocationHeader);
+  SharedAllocationHeader const* const header =
+    reinterpret_cast<SharedAllocationHeader const*>(header_start);
+  char const* const label = header->label();
+  enum { LEN = 128 };
+  char msg[LEN];
+  char const* const first_part = "View bounds error of view ";
+  char* p = msg;
+  char* const end = msg + LEN - 1;
+  for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) {
+    *p = *p2;
+  }
+  for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) {
+    *p = *p2;
+  }
+  *p = '\0';
+  Kokkos::abort(msg);
+}
+};
+
+/* Check #2: does the ViewMapping have the printable_label_typedef defined?
+   See above that only the non-specialized standard-layout ViewMapping has
+   this defined by default.
+   The existence of this typedef indicates the existence of MapType::is_managed */
+template< class T, class Enable = void >
+struct has_printable_label_typedef : public std::false_type {};
+
+template< class T >
+struct has_printable_label_typedef<
+  T, typename enable_if_type< typename T::printable_label_typedef >::type>
+  : public std::true_type
+{};
+
+template< class MapType >
+KOKKOS_INLINE_FUNCTION
+void operator_bounds_error_on_device(
+    MapType const&,
+    std::false_type) {
+  Kokkos::abort("View bounds error");
+}
+
+template< class MapType >
+KOKKOS_INLINE_FUNCTION
+void operator_bounds_error_on_device(
+    MapType const& map,
+    std::true_type) {
+  OperatorBoundsErrorOnDevice< MapType >::run(map);
+}
+
+#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
 template< class MemorySpace , class MapType , class ... Args >
 KOKKOS_INLINE_FUNCTION
 void view_verify_operator_bounds
@@ -3166,7 +3233,17 @@ void view_verify_operator_bounds
     view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
     Kokkos::Impl::throw_runtime_exception(std::string(buffer));
 #else
-    Kokkos::abort("View bounds error");
+    /* Check #1: is there a SharedAllocationRecord?
+       (we won't use it, but if it's not there then there isn't
+       a corresponding SharedAllocationHeader containing a label).
+       This check should cover the case of Views that don't
+       have the Unmanaged trait but were initialized by pointer. */
+    if (tracker.has_record()) {
+      operator_bounds_error_on_device(
+          map, has_printable_label_typedef< MapType >());
+    } else {
+      Kokkos::abort("View bounds error");
+    }
 #endif
   }
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp
index ecbcf72fe0..5a8600e0ae 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 // Kokkos v. 2.0
 // Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: @@ -36,7 +36,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// +// // ************************************************************************ //@HEADER */ @@ -48,7 +48,6 @@ //---------------------------------------------------------------------------- namespace Kokkos { -namespace Experimental { namespace Impl { // View mapping for rank two tiled array @@ -195,11 +194,9 @@ struct ViewMapping }; } /* namespace Impl */ -} /* namespace Experimental */ } /* namespace Kokkos */ namespace Kokkos { -namespace Experimental { template< typename T , unsigned N0 , unsigned N1 , class ... P > KOKKOS_INLINE_FUNCTION @@ -217,7 +214,6 @@ tile_subview( const Kokkos::View,P...> & ( src , SrcLayout() , i_tile0 , i_tile1 ); } -} /* namespace Experimental */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp deleted file mode 100644 index 101b714fcd..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp +++ /dev/null @@ -1,183 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) - -#include - -#include -#include - -/*--------------------------------------------------------------------------*/ - -#if !defined( _WIN32 ) - #if defined( KOKKOS_ENABLE_ASM ) - #if defined( __arm__ ) || defined( __aarch64__ ) - /* No-operation instruction to idle the thread. */ - #define KOKKOS_INTERNAL_PAUSE - #else - /* Pause instruction to prevent excess processor bus usage */ - #define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory") - #endif - #define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n") - #define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2 - #define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4; - #define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8; - #define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16; - namespace { - inline void kokkos_internal_yield( const unsigned i ) noexcept { - switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) { - case 0u: KOKKOS_INTERNAL_NOP2; break; - case 1u: KOKKOS_INTERNAL_NOP4; break; - case 2u: KOKKOS_INTERNAL_NOP8; break; - case 3u: KOKKOS_INTERNAL_NOP16; break; - default: KOKKOS_INTERNAL_NOP32; - } - KOKKOS_INTERNAL_PAUSE; - } - } - #else - #include - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - sched_yield(); - } - } - #endif -#else // defined( _WIN32 ) - #if defined ( KOKKOS_ENABLE_WINTHREAD ) - #include - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - Sleep(0); - } - } - #elif defined( _MSC_VER ) - #define NOMINMAX - #include - #include - namespace { - inline void kokkos_internal_yield( const unsigned ) noexcept { - YieldProcessor(); - } - } - #else - #define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory") - #define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop") - #define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2 - #define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4; - #define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8; - #define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16; - namespace { - inline void kokkos_internal_yield( const unsigned i ) noexcept { - switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) { - case 0: KOKKOS_INTERNAL_NOP2; break; - case 1: KOKKOS_INTERNAL_NOP4; break; - case 2: KOKKOS_INTERNAL_NOP8; break; - case 3: KOKKOS_INTERNAL_NOP16; break; - default: KOKKOS_INTERNAL_NOP32; - } - KOKKOS_INTERNAL_PAUSE; - } - } - #endif -#endif - - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -void spinwait_while_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value == flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value != flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_while_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value == flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int64_t & flag , 
const int64_t value ) -{ - Kokkos::store_fence(); - unsigned i = 0; - while ( value != flag ) { - kokkos_internal_yield(i); - ++i; - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#else -void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} -#endif - diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index 5d6f25ac95..475b6bb48a 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -57,6 +57,7 @@ IF(Kokkos_ENABLE_Serial) serial/TestSerial_ViewMapping_b.cpp serial/TestSerial_ViewMapping_subview.cpp serial/TestSerial_ViewOfClass.cpp + serial/TestSerial_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -102,6 +103,7 @@ IF(Kokkos_ENABLE_Pthread) threads/TestThreads_ViewMapping_b.cpp threads/TestThreads_ViewMapping_subview.cpp threads/TestThreads_ViewOfClass.cpp + threads/TestThreads_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -147,6 +149,8 @@ IF(Kokkos_ENABLE_OpenMP) openmp/TestOpenMP_ViewMapping_b.cpp openmp/TestOpenMP_ViewMapping_subview.cpp openmp/TestOpenMP_ViewOfClass.cpp + openmp/TestOpenMP_WorkGraph.cpp + openmp/TestOpenMP_UniqueToken.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -237,6 +241,7 @@ IF(Kokkos_ENABLE_Cuda) cuda/TestCuda_ViewMapping_b.cpp cuda/TestCuda_ViewMapping_subview.cpp cuda/TestCuda_ViewOfClass.cpp + cuda/TestCuda_WorkGraph.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " @@ -253,6 +258,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( default/TestDefaultDeviceType_b.cpp default/TestDefaultDeviceType_c.cpp default/TestDefaultDeviceType_d.cpp + default/TestDefaultDeviceTypeResize.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 41f192a486..c877aa7dd2 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -62,8 +62,9 @@ endif OBJ_CUDA += TestCuda_TeamReductionScan.o OBJ_CUDA += TestCuda_Other.o OBJ_CUDA += TestCuda_MDRange.o - OBJ_CUDA += TestCuda_Task.o + OBJ_CUDA += TestCuda_Task.o TestCuda_WorkGraph.o OBJ_CUDA += TestCuda_Spaces.o + OBJ_CUDA += TestCuda_UniqueToken.o TARGETS += KokkosCore_UnitTest_Cuda @@ -121,7 +122,8 @@ endif OBJ_OPENMP += TestOpenMP_TeamReductionScan.o OBJ_OPENMP += TestOpenMP_Other.o OBJ_OPENMP += TestOpenMP_MDRange.o - OBJ_OPENMP += TestOpenMP_Task.o + OBJ_OPENMP += TestOpenMP_Task.o TestOpenMP_WorkGraph.o + OBJ_OPENMP += TestOpenMP_UniqueToken.o TARGETS += KokkosCore_UnitTest_OpenMP @@ -208,7 +210,7 @@ endif OBJ_SERIAL += TestSerial_TeamReductionScan.o OBJ_SERIAL += TestSerial_Other.o OBJ_SERIAL += TestSerial_MDRange.o - OBJ_SERIAL += TestSerial_Task.o + OBJ_SERIAL += TestSerial_Task.o TestSerial_WorkGraph.o TARGETS += KokkosCore_UnitTest_Serial diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp index 6896a27bfb..87440c36be 100644 --- a/lib/kokkos/core/unit_test/TestAggregate.hpp +++ b/lib/kokkos/core/unit_test/TestAggregate.hpp @@ -58,7 +58,7 @@ template< class DeviceType > void TestViewAggregate() { typedef Kokkos::Array< double, 32 > value_type; - typedef Kokkos::Experimental::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d; + typedef Kokkos::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d; static_assert( std::is_same< typename 
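
For reference, the spinwait file deleted above escalated its busy-wait geometrically: iteration i executes 2 << bit_scan_reverse((i >> 2) + 1) no-ops, capped at 32, before a pause, i.e. 2 nops for i < 4, 4 for i < 12, 8 for i < 28, 16 for i < 60, and 32 thereafter. A rough portable sketch of that escalation, using a plain compiler barrier in place of the hand-written nop/pause macros (GNU-style inline asm assumed):

    // Portable stand-in for Kokkos::Impl::bit_scan_reverse:
    // index of the highest set bit of a nonzero value.
    inline unsigned bsr( unsigned v ) {
      unsigned r = 0;
      while ( v >>= 1 ) ++r;
      return r;
    }

    // Escalating backoff mirroring the deleted kokkos_internal_yield logic.
    inline void backoff( unsigned i ) {
      unsigned nops = 2u << bsr( ( i >> 2 ) + 1u );
      if ( nops > 32u ) nops = 32u;
      for ( unsigned k = 0 ; k < nops ; ++k ) {
        __asm__ __volatile__( "" ::: "memory" );  // compiler barrier in place of a real nop/pause
      }
    }
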
analysis_1d::specialize, Kokkos::Array<> >::value, "" ); diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp index 401da58a58..68864c8d66 100644 --- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -186,6 +186,21 @@ void check_correct_initialization( const Kokkos::InitArguments & argstruct ) { // Figure out the number of threads the HostSpace ExecutionSpace should have initialized to. int expected_nthreads = argstruct.num_threads; +#ifdef KOKKOS_ENABLE_OPENMP + if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) { + // use openmp default num threads + if ( expected_nthreads < 0 || ( expected_nthreads == 0 && !Kokkos::hwloc::available() ) ) { + expected_nthreads = omp_get_max_threads(); + } + // use hwloc if available + else if ( expected_nthreads == 0 && Kokkos::hwloc::available() ) { + expected_nthreads = Kokkos::hwloc::get_available_numa_count() + * Kokkos::hwloc::get_available_cores_per_numa() + * Kokkos::hwloc::get_available_threads_per_core(); + } + } +#endif + if ( expected_nthreads < 1 ) { if ( Kokkos::hwloc::available() ) { expected_nthreads = Kokkos::hwloc::get_available_numa_count() @@ -193,12 +208,6 @@ void check_correct_initialization( const Kokkos::InitArguments & argstruct ) { * Kokkos::hwloc::get_available_threads_per_core(); } else { -#ifdef KOKKOS_ENABLE_OPENMP - if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) { - expected_nthreads = omp_get_max_threads(); - } - else -#endif expected_nthreads = 1; } diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index 091591bcbf..f579ddf02c 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -51,6 +51,180 @@ namespace Test { namespace { +template +struct TestMDRange_ReduceArray_2D { + + using DataType = int; + using ViewType_2 = typename Kokkos::View< DataType**, ExecSpace >; + using HostViewType_2 = typename ViewType_2::HostMirror; + + ViewType_2 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_2D( const int N0, const int N1, const unsigned array_size ) + : input_view( "input_view", N0, N1 ) + , value_count( array_size ) + {} + + KOKKOS_INLINE_FUNCTION + void init( scalar_type dst[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join( volatile scalar_type dst[], + const volatile scalar_type src[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] += src[i]; + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j ) const + { + input_view( i, j ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, value_type lsum ) const + { + lsum[0] += input_view( i, j ) * 2; //+=6 each time if InitTag => N0*N1*6 + lsum[1] += input_view( i, j ) ; //+=3 each time if InitTag => N0*N1*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j ) const + { + input_view( i, j ) = 3; + } + + static void test_arrayreduce2( const int N0, const int N1 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType, InitTag > range_type_init; + typedef typename 
Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type_init range_init( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_2D functor( N0, N1, array_size ); + + parallel_for( range_init, functor ); // Init the view to 3's + + double sums[ array_size ]; + parallel_reduce( range, functor, sums ); + + // Check output + //printf("Array Reduce result. N0 = %d N1 = %d N0*N1 = %d sums[0] = %lf sums[1] = %lf \n", N0, N1, N0*N1, sums[0], sums[1]); + + ASSERT_EQ( sums[0], 6 * N0 * N1 ); + ASSERT_EQ( sums[1], 3 * N0 * N1 ); + } + } +}; + +template +struct TestMDRange_ReduceArray_3D { + + using DataType = int; + using ViewType_3 = typename Kokkos::View< DataType***, ExecSpace >; + using HostViewType_3 = typename ViewType_3::HostMirror; + + ViewType_3 input_view; + + using scalar_type = double; + using value_type = scalar_type[]; + const unsigned value_count; + + TestMDRange_ReduceArray_3D( const int N0, const int N1, const int N2, const unsigned array_size ) + : input_view( "input_view", N0, N1, N2 ) + , value_count( array_size ) + {} + + KOKKOS_INLINE_FUNCTION + void init( scalar_type dst[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] = 0.0; + } + } + + KOKKOS_INLINE_FUNCTION + void join( volatile scalar_type dst[], + const volatile scalar_type src[] ) const + { + for ( unsigned i = 0; i < value_count; ++i ) { + dst[i] += src[i]; + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k ) const + { + input_view( i, j, k ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, value_type lsum ) const + { + lsum[0] += input_view( i, j, k ) * 2; //+=6 each time if InitTag => N0*N1*N2*6 + lsum[1] += input_view( i, j, k ) ; //+=3 each time if InitTag => N0*N1*N2*3 + } + + // tagged operators + struct InitTag {}; + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k ) const + { + input_view( i, j, k ) = 3; + } + + static void test_arrayreduce3( const int N0, const int N1, const int N2 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType, InitTag > range_type_init; + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type_init range_init( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + + const unsigned array_size = 2; + + TestMDRange_ReduceArray_3D functor( N0, N1, N2, array_size ); + + parallel_for( range_init, functor ); // Init the view to 3's + + double sums[ array_size ]; + parallel_reduce( range, functor, sums ); + + ASSERT_EQ( sums[0], 6 * N0 * N1 * N2 ); + ASSERT_EQ( sums[1], 3 * N0 * N1 * N2 ); + } + } +}; + + template struct TestMDRange_2D { using DataType = int; @@ -58,6 +232,7 @@ struct TestMDRange_2D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_2D( 
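
The two ReduceArray functors above follow Kokkos' array-reduction protocol: value_type is declared as an unbounded array type, value_count fixes its runtime length, and init/join supply the identity and combine steps that parallel_reduce applies to thread-private copies. A minimal 1-D sketch of the same protocol (illustrative functor, not part of the test suite):

    #include <Kokkos_Core.hpp>

    struct ArraySum {
      typedef double value_type[];  // marks this as an array-valued reduction
      const unsigned value_count;   // runtime length, required by Kokkos

      ArraySum( unsigned n ) : value_count( n ) {}

      KOKKOS_INLINE_FUNCTION
      void init( double dst[] ) const {
        for ( unsigned k = 0 ; k < value_count ; ++k ) dst[k] = 0.0;
      }

      KOKKOS_INLINE_FUNCTION
      void join( volatile double dst[], const volatile double src[] ) const {
        for ( unsigned k = 0 ; k < value_count ; ++k ) dst[k] += src[k];
      }

      KOKKOS_INLINE_FUNCTION
      void operator()( const int i, double lsum[] ) const {
        lsum[0] += 1.0;       // element count
        lsum[1] += 2.0 * i;   // weighted contribution
      }
    };

    // Usage: sums[0] == N and sums[1] == N*(N-1) after the reduce.
    // double sums[2];
    // Kokkos::parallel_reduce( Kokkos::RangePolicy<>( 0, N ), ArraySum( 2 ), sums );
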
const DataType N0, const DataType N1 ) : input_view( "input_view", N0, N1 ) {} @@ -68,7 +243,7 @@ struct TestMDRange_2D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, double &lsum ) const + void operator()( const int i, const int j, value_type &lsum ) const { lsum += input_view( i, j ) * 2; } @@ -81,6 +256,13 @@ struct TestMDRange_2D { input_view( i, j ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, value_type &lsum ) const + { + lsum += input_view( i, j ) * 3; + } + static void test_reduce2( const int N0, const int N1 ) { using namespace Kokkos::Experimental; @@ -94,13 +276,85 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 ); + } + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 4 } } ); + + TestMDRange_2D functor( N0, N1 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + { + if ( h_view( i, j ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 ); + } + { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType > range_type; typedef typename range_type::tile_type tile_type; @@ -110,9 +364,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -126,9 +380,9 @@ struct 
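
The reducer-based tests above exercise the two result destinations Kokkos::Experimental::Sum accepts in this version: a scalar reference, updated in place, or a rank-0 View that is read back after the reduce. Condensed from the test bodies themselves:

    // Scalar result: the reducer holds a reference to a local variable.
    value_type sum = 0.0;
    Kokkos::Experimental::Sum< value_type > reducer_scalar( sum );
    parallel_reduce( range, functor, reducer_scalar );   // sum now holds the total

    // View result: the reducer holds a rank-0 View, read back afterwards.
    Kokkos::View< value_type, Kokkos::HostSpace > sum_view( "sum_view" );
    Kokkos::Experimental::Sum< value_type > reducer_view( sum_view );
    parallel_reduce( range, functor, reducer_view );
    sum = sum_view();
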
TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -142,9 +396,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -158,9 +412,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -174,9 +428,9 @@ struct TestMDRange_2D { TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 ); } @@ -194,7 +448,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -223,7 +477,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -251,7 +505,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -280,7 +534,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -309,7 +563,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 4, 4 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -338,7 +592,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -367,7 +621,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 7, 7 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = 
Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -396,7 +650,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 16, 16 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -425,7 +679,7 @@ struct TestMDRange_2D { range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 5, 16 } } ); TestMDRange_2D functor( N0, N1 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -455,6 +709,7 @@ struct TestMDRange_3D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0, N1, N2 ) {} @@ -478,6 +733,13 @@ struct TestMDRange_3D { input_view( i, j, k ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, value_type &lsum ) const + { + lsum += input_view( i, j, k ) * 3; + } + static void test_reduce3( const int N0, const int N1, const int N2 ) { using namespace Kokkos::Experimental; @@ -491,13 +753,86 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + } + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0 }}, {{ N0, N1, N2 }}, {{ 3, 3, 3 }} ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } ); + + TestMDRange_3D functor( N0, N1, N2 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i 
< N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + { + if ( h_view( i, j, k ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 ); + } + { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType > range_type; typedef typename range_type::tile_type tile_type; @@ -507,9 +842,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -523,9 +858,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -539,9 +874,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -555,9 +890,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -571,9 +906,9 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); double sum = 0.0; - md_parallel_reduce( range, functor, sum ); + parallel_reduce( range, functor, sum ); ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); } @@ -590,7 +925,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -620,7 +955,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -651,7 +986,7 @@ struct TestMDRange_3D { TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -681,7 +1016,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -711,7 +1046,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { 
N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -741,7 +1076,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 5, 7 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -771,7 +1106,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 8, 8, 8 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -801,7 +1136,7 @@ struct TestMDRange_3D { range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } ); TestMDRange_3D functor( N0, N1, N2 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -832,6 +1167,7 @@ struct TestMDRange_4D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_4D( const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0, N1, N2, N3 ) {} @@ -855,6 +1191,191 @@ struct TestMDRange_4D { input_view( i, j, k, l ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, value_type &lsum ) const + { + lsum += input_view( i, j, k, l ) * 3; + } + + static void test_reduce4( const int N0, const int N1, const int N2, const int N3 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0 }}, {{ N0, N1, N2, N3 }}, {{ 3, 3, 3, 3 }} ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > 
sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + { + if ( h_view( i, j, k, l ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce4 parallel_for init; mismatches = %d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default >, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Left>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 
0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 2, 4, 6, 2 } } ); + + TestMDRange_4D functor( N0, N1, N2, N3 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + } + } // end test_reduce + + + static void test_for4( const int N0, const int N1, const int N2, const int N3 ) { using namespace Kokkos::Experimental; @@ -866,7 +1387,7 @@ struct TestMDRange_4D { range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -897,7 +1418,7 @@ struct TestMDRange_4D { range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -929,7 +1450,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -961,7 +1482,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -993,7 +1514,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1025,7 +1546,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1057,7 +1578,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1089,7 +1610,7 @@ struct TestMDRange_4D { TestMDRange_4D functor( N0, N1, N2, N3 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1121,6 +1642,7 @@ struct TestMDRange_5D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_5D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0, N1, N2, N3, N4 ) {} @@ 
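
The four Iterate combinations cycled through above fix the nesting order of the tiled loops: the first template argument of Rank selects the order in which tiles are visited, the second the traversal order within a tile, and Iterate::Default defers to the execution space's preferred layout (left-fastest on CUDA-like spaces, right-fastest on host spaces). Schematically, with ExecSpace as a generic placeholder:

    using Kokkos::Experimental::MDRangePolicy;
    using Kokkos::Experimental::Rank;
    using Kokkos::Experimental::Iterate;

    // Tiles visited left-fastest, entries within a tile visited right-fastest.
    typedef MDRangePolicy< ExecSpace, Rank< 2, Iterate::Left, Iterate::Right > > policy_lr;

    // Both orders chosen by the execution space.
    typedef MDRangePolicy< ExecSpace, Rank< 2 > > policy_default;
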
-1131,7 +1653,7 @@ struct TestMDRange_5D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, const int k, const int l, const int m, double &lsum ) const + void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const { lsum += input_view( i, j, k, l, m ) * 2; } @@ -1144,6 +1666,110 @@ struct TestMDRange_5D { input_view( i, j, k, l, m ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const + { + lsum += input_view( i, j, k, l, m ) * 3; + } + + static void test_reduce5( const int N0, const int N1, const int N2, const int N3, const int N4 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4 }}, {{ 3, 3, 3, 3, 3 }} ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 2, 4, 6, 2, 2 } } ); + + TestMDRange_5D functor( N0, N1, N2, N3, N4 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + for ( int m = 0; m < N4; ++m ) + { + if ( h_view( i, j, k, l, m ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce5 parallel_for init; mismatches = 
%d\n\n", counter ); + } + ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 ); + } + } + static void test_for5( const int N0, const int N1, const int N2, const int N3, const int N4 ) { using namespace Kokkos::Experimental; @@ -1155,7 +1781,7 @@ struct TestMDRange_5D { range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1184,10 +1810,10 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 7 } } ); + range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1220,7 +1846,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1253,7 +1879,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1286,7 +1912,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1319,7 +1945,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1352,7 +1978,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1385,7 +2011,7 @@ struct TestMDRange_5D { TestMDRange_5D functor( N0, N1, N2, N3, N4 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1418,6 +2044,7 @@ struct TestMDRange_6D { using HostViewType = typename ViewType::HostMirror; ViewType input_view; + using value_type = double; TestMDRange_6D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0, N1, N2, N3, N4, N5 ) {} @@ -1428,7 +2055,7 @@ struct TestMDRange_6D { } KOKKOS_INLINE_FUNCTION - void operator()( const int i, const int j, const int k, const int l, const int m, const int n, double &lsum ) const + 
void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const { lsum += input_view( i, j, k, l, m, n ) * 2; } @@ -1441,6 +2068,111 @@ struct TestMDRange_6D { input_view( i, j, k, l, m, n ) = 3; } + // reduction tagged operators + KOKKOS_INLINE_FUNCTION + void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const + { + lsum += input_view( i, j, k, l, m, n ) * 3; + } + + static void test_reduce6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Test with reducers - scalar + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::Experimental::Sum< value_type > reducer_scalar( sum ); + + parallel_reduce( range, functor, reducer_scalar ); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Test with reducers - scalar view + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + range_type range( {{ 0, 0, 0, 0, 0, 0 }}, {{ N0, N1, N2, N3, N4, N5 }}, {{ 3, 3, 3, 3, 3, 2 }} ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + value_type sum = 0.0; + Kokkos::View< value_type, Kokkos::HostSpace > sum_view("sum_view"); + sum_view() = sum; + Kokkos::Experimental::Sum< value_type > reducer_view( sum_view ); + + parallel_reduce( range, functor, reducer_view); + sum = sum_view(); + + ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + + // Tagged operator test + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Default, Iterate::Default >, Kokkos::IndexType, InitTag > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 2, 4, 6, 2, 2, 2 } } ); + + TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); + + parallel_for( range, functor ); + + // check parallel_for results correct with InitTag + HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); + Kokkos::deep_copy( h_view, functor.input_view ); + int counter = 0; + for ( int i = 0; i < N0; ++i ) + for ( int j = 0; j < N1; ++j ) + for ( int k = 0; k < N2; ++k ) + for ( int l = 0; l < N3; ++l ) + for ( int m = 0; m < N4; ++m ) + for ( int n = 0; n < N5; ++n ) + { + if ( h_view( i, j, k, l, m, n ) != 3 ) { + ++counter; + } + } + + if ( counter != 0 ) { + printf( "Defaults + InitTag op(): Errors in test_reduce6 parallel_for init; mismatches = %d\n\n", counter ); + } 
+ ASSERT_EQ( counter, 0 ); + + + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 9 * N0 * N1 * N2 * N3 * N4 * N5 ); + } + } + static void test_for6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) { using namespace Kokkos::Experimental; @@ -1452,7 +2184,7 @@ struct TestMDRange_6D { range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1485,7 +2217,7 @@ struct TestMDRange_6D { range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1519,7 +2251,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1553,7 +2285,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1587,7 +2319,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1621,7 +2353,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1655,7 +2387,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1689,7 +2421,7 @@ struct TestMDRange_6D { TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); - md_parallel_for( range, functor ); + parallel_for( range, functor ); HostViewType h_view = Kokkos::create_mirror_view( functor.input_view ); Kokkos::deep_copy( h_view, functor.input_view ); @@ -1726,11 +2458,19 @@ TEST_F( TEST_CATEGORY , mdrange_for ) { TestMDRange_6D< TEST_EXECSPACE >::test_for6( 10, 10, 10, 10, 5, 5 ); } -#ifndef KOKKOS_ENABLE_CUDA TEST_F( TEST_CATEGORY , mdrange_reduce ) { TestMDRange_2D< TEST_EXECSPACE >::test_reduce2( 100, 100 ); TestMDRange_3D< TEST_EXECSPACE >::test_reduce3( 100, 10, 100 ); + TestMDRange_4D< TEST_EXECSPACE >::test_reduce4( 100, 10, 10, 10 ); + TestMDRange_5D< TEST_EXECSPACE >::test_reduce5( 100, 10, 10, 10, 5 ); + TestMDRange_6D< TEST_EXECSPACE >::test_reduce6( 100, 10, 10, 10, 5, 5 ); } -#endif + +//#ifndef KOKKOS_ENABLE_CUDA +TEST_F( TEST_CATEGORY , mdrange_array_reduce ) { + 
TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 ); + TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 ); +} +//#endif } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp index 941cd6c26d..9f708390c2 100644 --- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp +++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp @@ -54,6 +54,96 @@ namespace TestMemoryPool { +template< typename MemSpace = Kokkos::HostSpace > +void test_host_memory_pool_defaults() +{ + typedef typename MemSpace::execution_space Space ; + typedef typename Kokkos::MemoryPool< Space > MemPool ; + + { + const size_t MemoryCapacity = 32000 ; + const size_t MinBlockSize = 64 ; + const size_t MaxBlockSize = 1024 ; + const size_t SuperBlockSize = 4096 ; + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + , MaxBlockSize + , SuperBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_LE( MinBlockSize , stats.min_block_bytes ); + ASSERT_LE( MaxBlockSize , stats.max_block_bytes ); + ASSERT_LE( SuperBlockSize , stats.superblock_bytes ); + } + + { + const size_t MemoryCapacity = 10000 ; + + MemPool pool( MemSpace() + , MemoryCapacity + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_LE( 64u /* default */ , stats.min_block_bytes ); + ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } + + { + const size_t MemoryCapacity = 10000 ; + const size_t MinBlockSize = 32 ; // power of two is exact + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_EQ( MinBlockSize , stats.min_block_bytes ); + ASSERT_LE( stats.min_block_bytes , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } + + { + const size_t MemoryCapacity = 32000 ; + const size_t MinBlockSize = 32 ; // power of two is exact + const size_t MaxBlockSize = 1024 ; // power of two is exact + + MemPool pool( MemSpace() + , MemoryCapacity + , MinBlockSize + , MaxBlockSize + ); + + typename MemPool::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + ASSERT_LE( MemoryCapacity , stats.capacity_bytes ); + ASSERT_EQ( MinBlockSize , stats.min_block_bytes ); + ASSERT_EQ( MaxBlockSize , stats.max_block_bytes ); + ASSERT_LE( stats.max_block_bytes , stats.superblock_bytes ); + ASSERT_LE( stats.superblock_bytes , stats.capacity_bytes ); + } +} + template< typename MemSpace = Kokkos::HostSpace > void test_host_memory_pool_stats() { @@ -188,8 +278,8 @@ void print_memory_pool_stats << " bytes reserved = " << stats.reserved_bytes << std::endl << " bytes free = " << ( stats.capacity_bytes - ( stats.consumed_bytes + stats.reserved_bytes ) ) << std::endl - << " alloc used = " << stats.consumed_blocks << std::endl - << " alloc reserved = " << stats.reserved_blocks << std::endl + << " block used = " << stats.consumed_blocks << std::endl + << " block reserved = " << stats.reserved_blocks << std::endl << " super used = " << stats.consumed_superblocks << 
std::endl << " super reserved = " << ( stats.capacity_superblocks - stats.consumed_superblocks ) << std::endl @@ -302,15 +392,147 @@ void test_memory_pool_v2( const bool print_statistics //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -} // namespace TestMemoryPool { +template< class DeviceType > +struct TestMemoryPoolCorners { + + typedef Kokkos::View< uintptr_t * , DeviceType > ptrs_type ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + + pool_type pool ; + ptrs_type ptrs ; + uint32_t size ; + uint32_t stride ; + + TestMemoryPoolCorners( const pool_type & arg_pool + , const ptrs_type & arg_ptrs + , const uint32_t arg_base + , const uint32_t arg_stride + ) + : pool( arg_pool ) + , ptrs( arg_ptrs ) + , size( arg_base ) + , stride( arg_stride ) + {} + + // Specify reduction argument value_type to + // avoid confusion with tag-dispatch. + + using value_type = long ; + + KOKKOS_INLINE_FUNCTION + void operator()( int i , long & err ) const noexcept + { + unsigned alloc_size = size << ( i % stride ); + if ( 0 == ptrs(i) ) { + ptrs(i) = (uintptr_t) pool.allocate( alloc_size ); + if ( ptrs(i) && ! alloc_size ) { ++err ; } + } + } + + struct TagDealloc {}; + + KOKKOS_INLINE_FUNCTION + void operator()( int i ) const noexcept + { + unsigned alloc_size = size << ( i % stride ); + if ( ptrs(i) ) { pool.deallocate( (void*) ptrs(i) , alloc_size ); } + ptrs(i) = 0 ; + } +}; + +template< class DeviceType > +void test_memory_pool_corners( const bool print_statistics + , const bool print_superblocks ) +{ + typedef typename DeviceType::memory_space memory_space ; + typedef typename DeviceType::execution_space execution_space ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + typedef TestMemoryPoolCorners< DeviceType > functor_type ; + typedef typename functor_type::ptrs_type ptrs_type ; + + { + // superblock size 1 << 14 + const size_t min_superblock_size = 1u << 14 ; + + // four superblocks + const size_t total_alloc_size = min_superblock_size * 4 ; + + // block sizes { 64 , 128 , 256 , 512 } + // block counts { 256 , 128 , 64 , 32 } + const unsigned min_block_size = 64 ; + const unsigned max_block_size = 512 ; + const unsigned num_blocks = 480 ; + + pool_type pool( memory_space() + , total_alloc_size + , min_block_size + , max_block_size + , min_superblock_size ); + + // Allocate one block from each superblock to lock that + // superblock into the block size. 
+ + ptrs_type ptrs("ptrs",num_blocks); + + long err = 0 ; + + Kokkos::parallel_reduce + ( Kokkos::RangePolicy< execution_space >(0,4) + , functor_type( pool , ptrs , 64 , 4 ) + , err + ); + + if ( print_statistics || err ) { + + typename pool_type::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + print_memory_pool_stats< pool_type >( stats ); + } + + if ( print_superblocks || err ) { + pool.print_state( std::cout ); + } + + // Now fill remaining allocations with small size + + Kokkos::parallel_reduce + ( Kokkos::RangePolicy< execution_space >(0,num_blocks) + , functor_type( pool , ptrs , 64 , 1 ) + , err + ); + + if ( print_statistics || err ) { + + typename pool_type::usage_statistics stats ; + + pool.get_usage_statistics( stats ); + + print_memory_pool_stats< pool_type >( stats ); + } + + if ( print_superblocks || err ) { + pool.print_state( std::cout ); + } + } +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +} // namespace TestMemoryPool namespace Test { TEST_F( TEST_CATEGORY, memory_pool ) { + TestMemoryPool::test_host_memory_pool_defaults<>(); TestMemoryPool::test_host_memory_pool_stats<>(); TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false); + TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false); } + } #endif diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp index f55574761b..3cea1ad4a0 100644 --- a/lib/kokkos/core/unit_test/TestRange.hpp +++ b/lib/kokkos/core/unit_test/TestRange.hpp @@ -72,8 +72,33 @@ struct TestRange { typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( m_flags ); Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), *this ); + +#if defined(KOKKOS_ENABLE_PROFILING) + { + typedef TestRange< ExecSpace, ScheduleType > ThisType; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName< ThisType, void> pcn(label); + ASSERT_EQ( pcn.get(), label ); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName< ThisType, void> empty_pcn(empty_label); + ASSERT_EQ( empty_pcn.get(), typeid(ThisType).name() ); + } +#endif + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyInitTag >( 0, N ), *this ); +#if defined(KOKKOS_ENABLE_PROFILING) + { + typedef TestRange< ExecSpace, ScheduleType > ThisType; + std::string label("parallel_for"); + Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> pcn(label); + ASSERT_EQ( pcn.get(), label ); + std::string empty_label(""); + Kokkos::Impl::ParallelConstructName< ThisType, VerifyInitTag> empty_pcn(empty_label); + ASSERT_EQ( empty_pcn.get(), std::string(typeid(ThisType).name()) + "/" + typeid(VerifyInitTag).name() ); + } +#endif + Kokkos::deep_copy( host_flags, m_flags ); int error_count = 0; diff --git a/lib/kokkos/core/unit_test/TestResize.hpp b/lib/kokkos/core/unit_test/TestResize.hpp new file mode 100644 index 0000000000..aaf0422b19 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestResize.hpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef TESTVIEWSUBVIEW_HPP_ +#define TESTVIEWSUBVIEW_HPP_ + +#include <gtest/gtest.h> +#include <Kokkos_Core.hpp> + +namespace TestViewResize { + +template <class DeviceType> +void testResize () +{ + const int sizes[8] = {2, 3, 4, 5, 6, 7, 8, 9}; + + // Check #904 fix (no reallocation if dimensions didn't change).
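Each block below asserts the contract restored by the #904 fix: Kokkos::resize with unchanged extents must reuse the existing allocation, so view.data() stays stable. A one-view sketch of that contract, assuming a host-accessible memory space:

Kokkos::View< int*, Kokkos::HostSpace > v( "v", 10 );
const int * const before = v.data();

Kokkos::resize( v, 10 );  // same extent: allocation reused, v.data() == before
Kokkos::resize( v, 20 );  // larger extent: reallocates, preserving the first 10 entries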
+ { + typedef Kokkos::View<int*, DeviceType> view_type; + view_type view_1d ("view_1d", sizes[0]); + const int* oldPointer = view_1d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_1d, sizes[0]); + const int* newPointer = view_1d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int**, DeviceType> view_type; + view_type view_2d ("view_2d", sizes[0], sizes[1]); + const int* oldPointer = view_2d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_2d, sizes[0], sizes[1]); + const int* newPointer = view_2d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int***, DeviceType> view_type; + view_type view_3d ("view_3d", sizes[0], sizes[1], sizes[2]); + const int* oldPointer = view_3d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_3d, sizes[0], sizes[1], sizes[2]); + const int* newPointer = view_3d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int****, DeviceType> view_type; + view_type view_4d ("view_4d", sizes[0], sizes[1], sizes[2], sizes[3]); + const int* oldPointer = view_4d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_4d, sizes[0], sizes[1], sizes[2], sizes[3]); + const int* newPointer = view_4d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int*****, DeviceType> view_type; + view_type view_5d ("view_5d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4]); + const int* oldPointer = view_5d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_5d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4]); + const int* newPointer = view_5d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int******, DeviceType> view_type; + view_type view_6d ("view_6d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5]); + const int* oldPointer = view_6d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_6d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5]); + const int* newPointer = view_6d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int*******, DeviceType> view_type; + view_type view_7d ("view_7d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6]); + const int* oldPointer = view_7d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_7d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5], sizes[6]); + const int* newPointer = view_7d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } + { + typedef Kokkos::View<int********, DeviceType> view_type; + view_type view_8d ("view_8d", sizes[0], sizes[1], sizes[2], sizes[3], + sizes[4], sizes[5], sizes[6], sizes[7]); + const int* oldPointer = view_8d.data (); + EXPECT_TRUE( oldPointer != NULL ); + Kokkos::resize (view_8d, sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], + sizes[5], sizes[6], sizes[7]); + const int* newPointer = view_8d.data (); + EXPECT_TRUE( oldPointer == newPointer ); + } +} + +} // namespace TestViewResize + +#endif // TESTVIEWSUBVIEW_HPP_ diff --git a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp index 3a88475620..4e66543857 100644 --- a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp +++ b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp @@ -250,13 +250,21 @@ struct TestTaskDependence { const int n = CHUNK < m_count ?
CHUNK : m_count; if ( 1 < m_count ) { - future_type f[ CHUNK ]; + // Test use of memory pool for temporary allocation: + + // Raw allocation: + future_type * const f = + (future_type *) m_sched.memory()->allocate( sizeof(future_type) * n ); + + // In-place construction: + for ( int i = 0; i < n; ++i ) new(f+i) future_type(); const int inc = ( m_count + n - 1 ) / n; for ( int i = 0; i < n; ++i ) { long begin = i * inc; long count = begin + inc < m_count ? inc : m_count - begin; + f[i] = Kokkos::task_spawn( Kokkos::TaskSingle( m_sched ) , TestTaskDependence( count, m_sched, m_accum ) ); } @@ -264,6 +272,12 @@ struct TestTaskDependence { m_count = 0; Kokkos::respawn( this, Kokkos::when_all( f, n ) ); + + // In-place destruction to release future: + for ( int i = 0; i < n; ++i ) (f+i)->~future_type(); + + // Raw deallocation: + m_sched.memory()->deallocate( f , sizeof(future_type) * n ); } else if ( 1 == m_count ) { Kokkos::atomic_increment( & m_accum() ); @@ -641,19 +655,12 @@ namespace Test { TEST_F( TEST_CATEGORY, task_fib ) { - const int N = 24 ; // 25 triggers tbd bug on Cuda/Pascal + const int N = 27 ; for ( int i = 0; i < N; ++i ) { - TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 10000 ); + TestTaskScheduler::TestFib< TEST_EXECSPACE >::run( i , ( i + 1 ) * ( i + 1 ) * 2000 ); } } -#if defined(KOKKOS_ARCH_MAXWELL) || defined(KOKKOS_ARCH_PASCAL) - // TODO: Resolve bug in task DAG for Pascal - #define KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#endif - -#ifndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL - TEST_F( TEST_CATEGORY, task_depend ) { for ( int i = 0; i < 25; ++i ) { @@ -667,11 +674,8 @@ TEST_F( TEST_CATEGORY, task_team ) //TestTaskScheduler::TestTaskTeamValue< TEST_EXECSPACE >::run( 1000 ); // Put back after testing. 
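The TestTaskDependence hunk above trades the former fixed-size stack array of futures for storage drawn from the scheduler's memory pool. Since the pool hands back raw bytes, object lifetime is managed by hand: placement-new each future, use them, invoke the destructors explicitly (which releases each future's task reference), then return the bytes. The same discipline in a self-contained, Kokkos-free sketch, with std::string standing in for future_type:

#include <cstdlib>
#include <new>
#include <string>

int main()
{
  typedef std::string elem_t ;
  const int n = 4 ;

  // Raw allocation, analogous to m_sched.memory()->allocate( sizeof(future_type) * n ):
  elem_t * const f = (elem_t *) std::malloc( sizeof(elem_t) * n );

  for ( int i = 0 ; i < n ; ++i ) new( f + i ) elem_t( "pending" );  // construct in place

  // ... use f[0] .. f[n-1] ...

  for ( int i = 0 ; i < n ; ++i ) ( f + i )->~elem_t();  // destroy in place

  // Raw deallocation, analogous to m_sched.memory()->deallocate( f , ... ):
  std::free( f );
  return 0 ;
}

Note the ordering in the test itself: the futures are destroyed only after Kokkos::respawn( this, Kokkos::when_all( f, n ) ), which works because the when_all future holds its own references to the spawned tasks.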
} -#else //ndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#undef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL -#endif //ndef KOKKOS_IMPL_DISABLE_UNIT_TEST_TASK_DAG_PASCAL - } + #endif // #if defined( KOKKOS_ENABLE_TASKDAG ) #endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index e9e2f7548a..7f4663d0f9 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -838,6 +838,18 @@ public: }, result ); const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols; + + if ( int64_t(solution) != int64_t(result) ) { + printf( " TestTripleNestedReduce failed solution(%ld) != result(%ld), nrows(%d) ncols(%d) league_size(%d) team_size(%d)\n" + , int64_t(solution) + , int64_t(result) + , int32_t(nrows) + , int32_t(ncols) + , int32_t(nrows/chunk_size) + , int32_t(team_size) + ); + } + ASSERT_EQ( solution, result ); } }; diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp index 8f57dfea75..f15667322f 100644 --- a/lib/kokkos/core/unit_test/TestTile.hpp +++ b/lib/kokkos/core/unit_test/TestTile.hpp @@ -94,7 +94,7 @@ struct ReduceTileErrors const size_t jtile = iwork / tile_dim0; if ( jtile < tile_dim1 ) { - tile_type tile = Kokkos::Experimental::tile_subview( m_array, itile, jtile ); + tile_type tile = Kokkos::tile_subview( m_array, itile, jtile ); if ( tile( 0, 0 ) != ptrdiff_t( ( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) { ++errors; diff --git a/lib/kokkos/core/unit_test/TestUniqueToken.hpp b/lib/kokkos/core/unit_test/TestUniqueToken.hpp new file mode 100644 index 0000000000..28add61a8a --- /dev/null +++ b/lib/kokkos/core/unit_test/TestUniqueToken.hpp @@ -0,0 +1,138 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +namespace Test { + +template< class Space > +class TestUniqueToken +{ +public: + typedef typename Space::execution_space execution_space; + typedef Kokkos::View< int * , execution_space > view_type ; + + Kokkos::Experimental::UniqueToken< execution_space , Kokkos::Experimental::UniqueTokenScope::Global > tokens ; + + view_type verify ; + view_type counts ; + view_type errors ; + + KOKKOS_INLINE_FUNCTION + void operator()( long ) const + { + const int32_t t = tokens.acquire(); + + bool ok = true ; + + ok = ok && 0 <= t ; + ok = ok && t < tokens.size(); + ok = ok && 0 == Kokkos::atomic_fetch_add( & verify(t) , 1 ); + + Kokkos::atomic_fetch_add( & counts(t) , 1 ); + + ok = ok && 1 == Kokkos::atomic_fetch_add( & verify(t) , -1 ); + + if ( ! ok ) { Kokkos::atomic_fetch_add( & errors(0) , 1 ) ; } + + tokens.release(t); + } + + TestUniqueToken() + : tokens( execution_space() ) + , verify( "TestUniqueTokenVerify" , tokens.size() ) + , counts( "TestUniqueTokenCounts" , tokens.size() ) + , errors( "TestUniqueTokenErrors" , 1 ) + {} + + static void run() + { + using policy = Kokkos::RangePolicy<execution_space> ; + + TestUniqueToken self ; + + { + const int duplicate = 100 ; + const long n = duplicate * self.tokens.size(); + + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::parallel_for( policy(0,n) , self ); + Kokkos::fence(); + } + + typename view_type::HostMirror host_counts = + Kokkos::create_mirror_view( self.counts ); + + Kokkos::deep_copy( host_counts , self.counts ); + + int32_t max = 0 ; + + { + const long n = host_counts.extent(0); + for ( long i = 0 ; i < n ; ++i ) { + if ( max < host_counts[i] ) max = host_counts[i] ; + } + } + + std::cout << "TestUniqueToken max reuse = " << max << std::endl ; + + typename view_type::HostMirror host_errors = + Kokkos::create_mirror_view( self.errors ); + + Kokkos::deep_copy( host_errors , self.errors ); + + ASSERT_EQ( host_errors(0) , 0 ); + } +}; + + +TEST_F( TEST_CATEGORY, unique_token ) +{ + TestUniqueToken< TEST_EXECSPACE >::run(); +} + +} // namespace Test + diff --git a/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp new file mode 100644 index 0000000000..305ddb2a1d --- /dev/null +++ b/lib/kokkos/core/unit_test/TestViewCtorPropEmbeddedDim.hpp @@ -0,0 +1,160 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1.
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <gtest/gtest.h> + +#include <Kokkos_Core.hpp> + +#include <type_traits> +#include <typeinfo> + +namespace Test { + +namespace { + +template <typename ExecSpace> +struct TestViewCtorProp_EmbeddedDim { + + using ViewIntType = typename Kokkos::View< int**, ExecSpace >; + using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >; + + // Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor + template < class ViewType > + struct Functor { + + ViewType v; + + Functor( const ViewType & v_ ) : v(v_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const int i ) const { + v(i) = i; + } + + }; + + + static void test_vcpt( const int N0, const int N1 ) + { + + // Create views to test + { + using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ; + using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ; + + VIT vi1("vi1", N0, N1); + VDT vd1("vd1", N0); + + // TEST: Test for common type between two views, one with type double, other with type int + // Deduce common value_type and construct a view with that type + { + // Two views + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ; + #if 0 + // debug output + for ( int i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + } + + printf( " Common value type view: %s \n", typeid( CVT() ).name() ); + printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); + if (
std::is_same< CommonViewValueType, double >::value == true ) { + printf("Proper common value_type\n"); + } + else { + printf("WRONG common value_type\n"); + } + // end debug output + #endif + } + + { + // Single view + auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1); + typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType; + typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT; + typedef typename CVT::HostMirror HostCVT; + + // Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg + CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 ); + + Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1), + Functor<CVT>(cv1) + ); + + HostCVT hcv1 = Kokkos::create_mirror_view( cv1 ); + Kokkos::deep_copy( hcv1, cv1 ); + + ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ; + } + + } + + } // end test_vcpt + +}; // end struct + +} // namespace + +TEST_F( TEST_CATEGORY , viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< TEST_EXECSPACE >::test_vcpt( 2, 3 ); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp index 6830c2e049..810ae72e73 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -56,24 +56,24 @@ void test_view_mapping() { typedef typename Space::execution_space ExecSpace; - typedef Kokkos::Experimental::Impl::ViewDimension<> dim_0; - typedef Kokkos::Experimental::Impl::ViewDimension< 2 > dim_s2; - typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3 > dim_s2_s3; - typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4; + typedef Kokkos::Impl::ViewDimension<> dim_0; + typedef Kokkos::Impl::ViewDimension< 2 > dim_s2; + typedef Kokkos::Impl::ViewDimension< 2, 3 > dim_s2_s3; + typedef Kokkos::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0 > dim_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3 > dim_s0_s3; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4; + typedef Kokkos::Impl::ViewDimension< 0 > dim_s0; + typedef Kokkos::Impl::ViewDimension< 0, 3 > dim_s0_s3; + typedef Kokkos::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0 > dim_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4; + typedef Kokkos::Impl::ViewDimension< 0, 0 > dim_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 >
dim_s0_s0_s0_s0_s0_s0_s0; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0; // Fully static dimensions should not be larger than an int. ASSERT_LE( sizeof( dim_0 ), sizeof( int ) ); @@ -186,12 +186,12 @@ void test_view_mapping() //---------------------------------------- - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0; //---------------------------------------- // Static dimension. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4; + typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4; ASSERT_EQ( sizeof( left_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) ); @@ -223,7 +223,7 @@ void test_view_mapping() //---------------------------------------- // Small dimension is unpadded. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) ); @@ -275,7 +275,7 @@ void test_view_mapping() constexpr int N0 = 2000; constexpr int N1 = 300; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) ); @@ -314,7 +314,7 @@ void test_view_mapping() //---------------------------------------- // Static dimension. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4; + typedef Kokkos::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4; ASSERT_EQ( sizeof( right_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) ); @@ -350,7 +350,7 @@ void test_view_mapping() //---------------------------------------- // Small dimension is unpadded. { - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) ); @@ -391,7 +391,7 @@ void test_view_mapping() constexpr int N0 = 2000; constexpr int N1 = 300; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) ); @@ -431,18 +431,18 @@ void test_view_mapping() // Subview. 
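These hunks track the subview machinery (SubviewExtents, ViewOffset, ALL) as it is promoted out of Kokkos::Experimental. The rank-4-to-rank-3 mapping exercised below corresponds, at the user level, to an ordinary Kokkos::subview call; a sketch with made-up extents:

Kokkos::View< double****, Kokkos::HostSpace > a( "a", 10, 20, 30, 40 );

auto s = Kokkos::subview( a
                        , 5                                  // single index: rank dropped
                        , Kokkos::ALL                        // full extent: 20
                        , std::pair< int, int >( 7, 17 )     // half-open range: extent 10
                        , Kokkos::pair< int, int >( 10, 30 ) // extent 20
                        );
// s has rank 3 and extents { 20, 10, 20 }; std::pair and Kokkos::pair are interchangeable here.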
{ // Mapping rank 4 to rank 3 - typedef Kokkos::Experimental::Impl::SubviewExtents< 4, 3 > SubviewExtents; + typedef Kokkos::Impl::SubviewExtents< 4, 3 > SubviewExtents; constexpr int N0 = 1000; constexpr int N1 = 2000; constexpr int N2 = 3000; constexpr int N3 = 4000; - Kokkos::Experimental::Impl::ViewDimension< N0, N1, N2, N3 > dim; + Kokkos::Impl::ViewDimension< N0, N1, N2, N3 > dim; SubviewExtents tmp( dim , N0 / 2 - , Kokkos::Experimental::ALL + , Kokkos::ALL , std::pair< int, int >( N2 / 4, 10 + N2 / 4 ) , Kokkos::pair< int, int >( N3 / 4, 20 + N3 / 4 ) ); @@ -469,12 +469,12 @@ void test_view_mapping() constexpr int sub_N1 = 200; constexpr int sub_N2 = 4; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4; left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) ); - Kokkos::Experimental::Impl::SubviewExtents< 3, 3 > + Kokkos::Impl::SubviewExtents< 3, 3 > sub( dyn_off3.m_dim , Kokkos::pair< int, int >( 0, sub_N0 ) , Kokkos::pair< int, int >( 0, sub_N1 ) @@ -509,12 +509,12 @@ void test_view_mapping() constexpr int sub_N1 = 200; constexpr int sub_N2 = 4; - typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; + typedef Kokkos::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4; right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >() , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) ); - Kokkos::Experimental::Impl::SubviewExtents< 3, 3 > + Kokkos::Impl::SubviewExtents< 3, 3 > sub( dyn_off3.m_dim , Kokkos::pair< int, int >( 0, sub_N0 ) , Kokkos::pair< int, int >( 0, sub_N1 ) @@ -544,7 +544,7 @@ void test_view_mapping() //---------------------------------------- // View data analysis. 
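The ViewArrayAnalysis and rank_dynamic checks that follow encode the View data-type convention: each leading '*' is a runtime extent, each trailing [N] a compile-time extent, and only the former count toward the dynamic rank. A sketch of the same bookkeeping at the View level (the rank constants are the ones these traits compute):

typedef Kokkos::View< int**[4][5][6], Kokkos::HostSpace > v_type ;

static_assert( v_type::rank == 5 , "two runtime plus three compile-time extents" );
static_assert( v_type::rank_dynamic == 2 , "only the '*' extents are set at run time" );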
{ - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; static_assert( rank_dynamic<>::value == 0, "" ); static_assert( rank_dynamic< 1 >::value == 0, "" ); @@ -554,7 +554,7 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef ViewArrayAnalysis< int[] > a_int_r1; typedef ViewArrayAnalysis< int**[4][5][6] > a_int_r5; @@ -598,7 +598,7 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef int t_i4[4]; @@ -616,12 +616,12 @@ void test_view_mapping() } { - using namespace Kokkos::Experimental::Impl; + using namespace Kokkos::Impl; typedef ViewDataAnalysis< const int[], void > a_const_int_r1; static_assert( std::is_same< typename a_const_int_r1::specialize, void >::value, "" ); - static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Experimental::Impl::ViewDimension<0> >::value, "" ); + static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Impl::ViewDimension<0> >::value, "" ); static_assert( std::is_same< typename a_const_int_r1::type, const int * >::value, "" ); static_assert( std::is_same< typename a_const_int_r1::value_type, const int >::value, "" ); @@ -637,7 +637,7 @@ void test_view_mapping() static_assert( std::is_same< typename a_const_int_r3::specialize, void >::value, "" ); - static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Experimental::Impl::ViewDimension<0, 0, 4> >::value, "" ); + static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Impl::ViewDimension<0, 0, 4> >::value, "" ); static_assert( std::is_same< typename a_const_int_r3::type, const int**[4] >::value, "" ); static_assert( std::is_same< typename a_const_int_r3::value_type, const int >::value, "" ); @@ -786,7 +786,7 @@ void test_view_mapping() // The execution space of the memory space must be available for view data initialization. 
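The remaining hunks in this file promote create_mirror, create_mirror_view, deep_copy, resize, and realloc from Kokkos::Experimental to the top-level namespace; the call sites are otherwise unchanged. The idiom they exercise is the standard mirror round trip, sketched here assuming a device-enabled build:

Kokkos::View< int*, Kokkos::DefaultExecutionSpace > d( "d", 100 );

// Host mirror; create_mirror_view aliases 'd' directly when the
// memory space is already host-accessible:
auto h = Kokkos::create_mirror_view( d );

for ( int i = 0 ; i < 100 ; ++i ) h( i ) = i ;  // fill on the host
Kokkos::deep_copy( d , h );                     // host -> device
Kokkos::deep_copy( h , d );                     // device -> host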
if ( std::is_same< ExecSpace, typename ExecSpace::memory_space::execution_space >::value ) { - using namespace Kokkos::Experimental; + using namespace Kokkos; typedef typename ExecSpace::memory_space memory_space; typedef View< int*, memory_space > V; @@ -811,8 +811,8 @@ void test_view_mapping() { typedef Kokkos::ViewTraits< int***, Kokkos::LayoutStride, ExecSpace > traits_t; - typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dims_t; - typedef Kokkos::Experimental::Impl::ViewOffset< dims_t, Kokkos::LayoutStride > offset_t; + typedef Kokkos::Impl::ViewDimension< 0, 0, 0 > dims_t; + typedef Kokkos::Impl::ViewOffset< dims_t, Kokkos::LayoutStride > offset_t; Kokkos::LayoutStride stride; @@ -836,8 +836,8 @@ void test_view_mapping() ASSERT_EQ( offset.span(), 60 ); ASSERT_TRUE( offset.span_is_contiguous() ); - Kokkos::Experimental::Impl::ViewMapping< traits_t, void > - v( Kokkos::Experimental::Impl::ViewCtorProp< int* >( (int*) 0 ), stride ); + Kokkos::Impl::ViewMapping< traits_t, void > + v( Kokkos::Impl::ViewCtorProp< int* >( (int*) 0 ), stride ); } { @@ -849,8 +849,8 @@ void test_view_mapping() constexpr int N1 = 11; V a( "a", N0, N1 ); - M b = Kokkos::Experimental::create_mirror( a ); - M c = Kokkos::Experimental::create_mirror_view( a ); + M b = Kokkos::create_mirror( a ); + M c = Kokkos::create_mirror_view( a ); M d; for ( int i0 = 0; i0 < N0; ++i0 ) @@ -859,8 +859,8 @@ void test_view_mapping() b( i0, i1 ) = 1 + i0 + i1 * N0; } - Kokkos::Experimental::deep_copy( a, b ); - Kokkos::Experimental::deep_copy( c, a ); + Kokkos::deep_copy( a, b ); + Kokkos::deep_copy( c, a ); for ( int i0 = 0; i0 < N0; ++i0 ) for ( int i1 = 0; i1 < N1; ++i1 ) @@ -868,7 +868,7 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) ); } - Kokkos::Experimental::resize( b, 5, 6 ); + Kokkos::resize( b, 5, 6 ); for ( int i0 = 0; i0 < 5; ++i0 ) for ( int i1 = 0; i1 < 6; ++i1 ) @@ -878,8 +878,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, 5, 6 ); - Kokkos::Experimental::realloc( d, 5, 6 ); + Kokkos::realloc( c, 5, 6 ); + Kokkos::realloc( d, 5, 6 ); ASSERT_EQ( b.dimension_0(), 5 ); ASSERT_EQ( b.dimension_1(), 6 ); @@ -889,7 +889,7 @@ void test_view_mapping() ASSERT_EQ( d.dimension_1(), 6 ); layout_type layout( 7, 8 ); - Kokkos::Experimental::resize( b, layout ); + Kokkos::resize( b, layout ); for ( int i0 = 0; i0 < 7; ++i0 ) for ( int i1 = 6; i1 < 8; ++i1 ) { @@ -909,8 +909,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, layout ); - Kokkos::Experimental::realloc( d, layout ); + Kokkos::realloc( c, layout ); + Kokkos::realloc( d, layout ); ASSERT_EQ( b.dimension_0(), 7 ); ASSERT_EQ( b.dimension_1(), 8 ); @@ -932,8 +932,8 @@ void test_view_mapping() const int order[] = { 1, 0 }; V a( "a", Kokkos::LayoutStride::order_dimensions( 2, order, dimensions ) ); - M b = Kokkos::Experimental::create_mirror( a ); - M c = Kokkos::Experimental::create_mirror_view( a ); + M b = Kokkos::create_mirror( a ); + M c = Kokkos::create_mirror_view( a ); M d; for ( int i0 = 0; i0 < N0; ++i0 ) @@ -942,8 +942,8 @@ void test_view_mapping() b( i0, i1 ) = 1 + i0 + i1 * N0; } - Kokkos::Experimental::deep_copy( a, b ); - Kokkos::Experimental::deep_copy( c, a ); + Kokkos::deep_copy( a, b ); + Kokkos::deep_copy( c, a ); for ( int i0 = 0; i0 < N0; ++i0 ) for ( int i1 = 0; i1 < N1; ++i1 ) @@ -954,7 +954,7 @@ void test_view_mapping() const int dimensions2[] = { 7, 8 }; const int order2[] = { 1, 0 }; layout_type layout = 
layout_type::order_dimensions( 2, order2, dimensions2 ); - Kokkos::Experimental::resize( b, layout ); + Kokkos::resize( b, layout ); for ( int i0 = 0; i0 < 7; ++i0 ) for ( int i1 = 0; i1 < 8; ++i1 ) @@ -964,8 +964,8 @@ void test_view_mapping() ASSERT_EQ( b( i0, i1 ), val ); } - Kokkos::Experimental::realloc( c, layout ); - Kokkos::Experimental::realloc( d, layout ); + Kokkos::realloc( c, layout ); + Kokkos::realloc( d, layout ); ASSERT_EQ( b.dimension_0(), 7 ); ASSERT_EQ( b.dimension_1(), 8 ); diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp index e3a12e684e..106323492a 100644 --- a/lib/kokkos/core/unit_test/TestViewSubview.hpp +++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp @@ -915,134 +915,134 @@ void test_3d_subview_5d_impl_layout() { inline void test_subview_legal_args_right() { - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, 
Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, 
( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, 
Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); } inline void test_subview_legal_args_left() { - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( 
Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair, int, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, 
Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, int, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::pair, int, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( 
Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::pair, int >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t, int >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< 
Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair, Kokkos::pair, int, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, 
Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); - ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); - ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::pair >::value ) ); + ASSERT_EQ( 1, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair, Kokkos::pair >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::Impl::ALL_t >::value ) ); + ASSERT_EQ( 0, ( Kokkos::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair, Kokkos::pair, Kokkos::pair >::value ) ); } } // namespace Impl diff --git a/lib/kokkos/core/unit_test/TestWorkGraph.hpp b/lib/kokkos/core/unit_test/TestWorkGraph.hpp new file mode 100644 index 0000000000..70cf6b47c0 --- /dev/null +++ b/lib/kokkos/core/unit_test/TestWorkGraph.hpp @@ -0,0 +1,172 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
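/* The assertions above track the promotion of SubviewLegalArgsCompileTime out of the Experimental namespace. The trait reports at compile time whether a subview taken with a given argument mix (scalar int, Kokkos::pair<int, int> range, Kokkos::ALL) can preserve the requested layout. A minimal standalone use, mirroring one of the LayoutLeft cases asserted legal above: */
static_assert( Kokkos::Impl::SubviewLegalArgsCompileTime<
                 Kokkos::LayoutLeft, Kokkos::LayoutLeft,   // destination and source layouts
                 3, 5, 0,                                  // destination rank, source rank, first argument position
                 Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, // leading extents kept whole
                 Kokkos::pair<int, int>,                   // followed by one range
                 int, int >::value,                        // trailing dimensions fixed, so rank drops to 3
               "contiguous-prefix subview can remain LayoutLeft" );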
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include + +namespace Test { + +namespace { + +/* This test is meant to be the WorkGraph equivalent of the Task DAG Scheduler test, + please see TestTaskScheduler.hpp for that test. + The algorithm computes the N-th fibonacci number as follows: + - Each "task" or "work item" computes the i-th fibonacci number + - If a task has (i < 2), it will record the known answer ahead of time. + - If a task has (i >= 2), it will "spawn" two more tasks to compute + the (i - 1) and (i - 2) fibonacci numbers. + We do NOT do any de-duplication of these tasks. + De-duplication would result in only (N - 2) tasks which must be run in serial. + We allow duplicates both to increase the number of tasks and to increase the + amount of available parallelism. + */ + +template< class ExecSpace > +struct TestWorkGraph { + + using MemorySpace = typename ExecSpace::memory_space; + using Policy = Kokkos::Experimental::WorkGraphPolicy<std::int32_t, ExecSpace>; + using Graph = typename Policy::graph_type; + using RowMap = typename Graph::row_map_type; + using Entries = typename Graph::entries_type; + using Values = Kokkos::View<long*, MemorySpace>; + + long m_input; + Graph m_graph; + Graph m_transpose; + Values m_values; + + TestWorkGraph(long arg_input):m_input(arg_input) { + form_graph(); + transpose_crs(m_transpose, m_graph); + } + + inline + long full_fibonacci( long n ) { + constexpr long mask = 0x03; + long fib[4] = { 0, 1, 1, 2 }; + for ( long i = 2; i <= n; ++i ) { + fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ]; + } + return fib[ n & mask ]; + } + + struct HostEntry { + long input; + std::int32_t parent; + }; + std::vector<HostEntry> form_host_graph() { + std::vector<HostEntry> g; + g.push_back({ m_input , -1 }); + for (std::int32_t i = 0; i < std::int32_t(g.size()); ++i) { + auto e = g.at(std::size_t(i)); + if (e.input < 2) continue; + /* This part of the host graph formation is the equivalent of task spawning + in the Task DAG system.
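(For example, m_input = 4 expands into work items whose inputs are 4, 3, 2, 2, 1, 1, 0, 1, 0: nine items rather than the five distinct values, and because the duplicates share no edges with one another they are free to run concurrently.)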
Notice how each task which is not a base case + spawns two more tasks, without any de-duplication */ + g.push_back({ e.input - 1, i }); + g.push_back({ e.input - 2, i }); + } + return g; + } + + void form_graph() { + auto hg = form_host_graph(); + m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more + m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent + m_values = Values("values", hg.size()); + auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map); + auto h_entries = Kokkos::create_mirror_view(m_graph.entries); + auto h_values = Kokkos::create_mirror_view(m_values); + h_row_map(0) = 0; + for (std::int32_t i = 0; i < std::int32_t(hg.size()); ++i) { + auto& e = hg.at(std::size_t(i)); + h_row_map(i + 1) = i; + if (e.input < 2) { + h_values(i) = e.input; + } + if (e.parent == -1) continue; + h_entries(i - 1) = e.parent; + } + Kokkos::deep_copy(m_graph.row_map, h_row_map); + Kokkos::deep_copy(m_graph.entries, h_entries); + Kokkos::deep_copy(m_values, h_values); + } + + KOKKOS_INLINE_FUNCTION + void operator()(std::int32_t i) const { + auto begin = m_transpose.row_map(i); + auto end = m_transpose.row_map(i + 1); + for (auto j = begin; j < end; ++j) { + auto k = m_transpose.entries(j); + m_values(i) += m_values( k ); + } + } + + void test_for() { + Kokkos::parallel_for(Policy(m_graph), *this); + auto h_values = Kokkos::create_mirror_view(m_values); + Kokkos::deep_copy(h_values, m_values); + ASSERT_EQ( h_values(0), full_fibonacci(m_input) ); + } + +}; + +} // anonymous namespace + +TEST_F( TEST_CATEGORY, DISABLED_workgraph_fib ) +{ + #ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND + int limit = 15; + #else + int limit = 27; + #endif + for ( int i = 0; i < limit; ++i) { + TestWorkGraph< TEST_EXECSPACE > f(i); + f.test_for(); + } +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp index 4f52fc9567..a7dc7c4973 100644 --- a/lib/kokkos/core/unit_test/UnitTestMain.cpp +++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp @@ -42,6 +42,7 @@ */ #include +#include int main( int argc, char *argv[] ) { ::testing::InitGoogleTest( &argc, argv ); diff --git a/lib/kokkos/core/unit_test/UnitTestMainInit.cpp b/lib/kokkos/core/unit_test/UnitTestMainInit.cpp index 21f851274b..62a01e9033 100644 --- a/lib/kokkos/core/unit_test/UnitTestMainInit.cpp +++ b/lib/kokkos/core/unit_test/UnitTestMainInit.cpp @@ -42,6 +42,8 @@ */ #include +#include + #include int main( int argc, char *argv[] ) { diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp index ba06b71192..fa6722615c 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp @@ -48,3 +48,5 @@ #include #include #include + +#include diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp new file mode 100644 index 0000000000..8424ae10d6 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_UniqueToken.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
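/* Worked example for form_graph() in TestWorkGraph.hpp above, derived directly from the code as written: with m_input = 2 the host graph holds items with inputs { 2, 1, 0 }, and the loop produces
     row_map = { 0, 0, 1, 2 }  // one more entry than there are items;
                               // item 0 has no entry, items 1 and 2 have one each
     entries = { 0, 0 }        // every item except the root records its parent
     values  = { 0, 1, 0 }     // base cases pre-filled; fib(2) left to compute
   transpose_crs() inverts the parent links, so operator()(0) sums the values of items 1 and 2 and test_for() observes h_values(0) == 1 == fib(2). */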
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp new file mode 100644 index 0000000000..663ca1d560 --- /dev/null +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
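/* A standalone sketch of the per-thread-id pattern that the new UniqueToken tests exercise (the demo function and its names are illustrative, not part of the patch): */
#include <Kokkos_Core.hpp>

template <class ExecSpace>
void unique_token_demo(int n) {
  Kokkos::Experimental::UniqueToken<ExecSpace> token;
  Kokkos::View<int*, ExecSpace> hits("hits", token.size());
  Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n),
    KOKKOS_LAMBDA(const int) {
      const int id = token.acquire();    // id is unique among ids currently held
      Kokkos::atomic_add(&hits(id), 1);  // safe per-id scratch indexing
      token.release(id);                 // hand the id back for reuse
    });
}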
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp new file mode 100644 index 0000000000..c02905535b --- /dev/null +++ b/lib/kokkos/core/unit_test/default/TestDefaultDeviceTypeResize.cpp @@ -0,0 +1,57 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include "TestResize.hpp" + +namespace Test { + +TEST( kokkosresize, host_space_access ) +{ + // Test with the default device type. 
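// (A sketch of the semantics under test: Kokkos::resize reallocates a view
// while preserving the contents of the overlapping extents, e.g.
//   Kokkos::View<int*> v("v", 10);
//   Kokkos::resize(v, 20); // entries 0..9 keep their values
// TestResize.hpp is assumed to exercise this across the supported view ranks.)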
+ using TestViewResize::testResize; + typedef Kokkos::View<int*>::device_type device_type; + testResize<device_type> (); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp index 2f8daf7ad7..c12574a65a 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp @@ -86,25 +86,26 @@ class openmp : public ::testing::Test { protected: static void SetUpTestCase() { - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + int threads_count = 0; + #pragma omp parallel + { + #pragma omp atomic + ++threads_count; + } - const unsigned threads_count = std::max( 1u, numa_count ) * - std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 ); + if (threads_count > 3) { + threads_count /= 2; + } Kokkos::OpenMP::initialize( threads_count ); Kokkos::print_configuration( std::cout, true ); + srand( 10231 ); } static void TearDownTestCase() { Kokkos::OpenMP::finalize(); - - omp_set_num_threads( 1 ); - - ASSERT_EQ( 1, omp_get_max_threads() ); } }; diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp index 5e9535638d..33e7402ce6 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp @@ -48,3 +48,93 @@ #include #include #include + +#include + +#include + +namespace Test { + +TEST_F( openmp, partition_master ) +{ + using Mutex = Kokkos::Experimental::MasterLock<Kokkos::OpenMP>; + + Mutex mtx; + int errors = 0; + + auto master = [&errors, &mtx](int partition_id, int num_partitions) { + + const int pool_size = Kokkos::OpenMP::thread_pool_size(); + + { + std::unique_lock<Mutex> lock(mtx); + if ( Kokkos::OpenMP::in_parallel() ) { + ++errors; + } + if ( Kokkos::OpenMP::thread_pool_rank() != 0 ) { + ++errors; + } + } + + { + int local_errors = 0; + Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000) + , [pool_size]( const int , int & errs ) { + if ( Kokkos::OpenMP::thread_pool_size() != pool_size ) { + ++errs; + } + } + , local_errors + ); + Kokkos::atomic_add( &errors, local_errors ); + } + + Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token; + + Kokkos::View<int*, Kokkos::OpenMP> count( "", token.size() ); + + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,1000), + [=] ( const int ) { + int i = token.acquire(); + ++count[i]; + token.release(i); + }); + + Kokkos::View<int, Kokkos::OpenMP> sum (""); + Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::OpenMP>(0,token.size()), + [=] ( const int i ) { + Kokkos::atomic_add( sum.data(), count[i] ); + }); + + if (sum() != 1000) { + Kokkos::atomic_add( &errors, 1 ); + } + }; + + master(0,1); + + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 4, 0 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 0, 4 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 2, 2 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 8, 0 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 0, 8 ); + ASSERT_EQ( errors, 0 ); + + Kokkos::OpenMP::partition_master( master, 8, 8 ); + ASSERT_EQ( errors, 0 ); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp
b/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp new file mode 100644 index 0000000000..143a6d9910 --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_UniqueToken.cpp @@ -0,0 +1,46 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp new file mode 100644 index 0000000000..ec6fa1653c --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
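/* The hwloc-free thread-counting idiom that this patch adopts in SetUpTestCase above, and again in the example main()s below, shown standalone; a sketch with illustrative names: */
#include <omp.h>

static int count_openmp_threads() {
  int n = 0;
  #pragma omp parallel
  {
    #pragma omp atomic
    ++n;      // every thread of the parallel region increments once
  }
  return n;   // the team size actually launched, not just the
              // upper bound reported by omp_get_max_threads()
}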
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp index a6a76a03bd..bc39b1e160 100644 --- a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp +++ b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp @@ -48,3 +48,5 @@ #include #include #include + +#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp new file mode 100644 index 0000000000..de1638de5e --- /dev/null +++ b/lib/kokkos/core/unit_test/serial/TestSerial_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp index c11155c5c0..160b37a2c8 100644 --- a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp +++ b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp @@ -48,3 +48,5 @@ #include #include #include + +#include diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp new file mode 100644 index 0000000000..6b7dbb26db --- /dev/null +++ b/lib/kokkos/core/unit_test/threads/TestThreads_WorkGraph.cpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include diff --git a/lib/kokkos/example/cmake_build/CMakeLists.txt b/lib/kokkos/example/cmake_build/CMakeLists.txt index 4e149726ee..f92c5c6513 100644 --- a/lib/kokkos/example/cmake_build/CMakeLists.txt +++ b/lib/kokkos/example/cmake_build/CMakeLists.txt @@ -40,5 +40,7 @@ list(APPEND CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -O3) add_subdirectory(${Example_SOURCE_DIR}/../.. 
${Example_BINARY_DIR}/kokkos) +include_directories(${Kokkos_INCLUDE_DIRS_RET}) + add_executable(example cmake_example.cpp) target_link_libraries(example kokkos) diff --git a/lib/kokkos/example/feint/main.cpp b/lib/kokkos/example/feint/main.cpp index 616e584bf6..57a8f8fafb 100644 --- a/lib/kokkos/example/feint/main.cpp +++ b/lib/kokkos/example/feint/main.cpp @@ -69,12 +69,26 @@ int main() #if defined( KOKKOS_ENABLE_OPENMP ) { - // Use 4 cores per NUMA region, unless fewer available - const unsigned use_numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() ); + int num_threads = 0; + if ( Kokkos::hwloc::available() ) { + // Use 4 cores per NUMA region, unless fewer available + const unsigned use_numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() ); + num_threads = use_numa_count * use_cores_per_numa; - Kokkos::OpenMP::initialize( use_numa_count * use_cores_per_numa ); + } + else { + #pragma omp parallel + { + #pragma omp atomic + ++num_threads; + } + num_threads = std::max(4, num_threads/4); + } + + + Kokkos::OpenMP::initialize( num_threads ); std::cout << "feint< OpenMP , NotUsingAtomic >" << std::endl ; Kokkos::Example::feint< Kokkos::OpenMP , false >(); diff --git a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp index fb33aef56e..b6b8b2f5e0 100644 --- a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp +++ b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp @@ -138,7 +138,16 @@ int main(int argc, char *argv[]) #endif #ifdef KOKKOS_ENABLE_OPENMP - Kokkos::OpenMP::initialize( threads_count ); + int num_threads = 0; + #pragma omp parallel + { + #pragma omp atomic + ++num_threads; + } + if( num_threads > 3 ) { + num_threads = std::max(4, num_threads/4); + } + Kokkos::OpenMP::initialize( num_threads ); num_errors += G2L::run_openmp(num_ids,num_find_iterations); Kokkos::OpenMP::finalize(); #endif diff --git a/lib/kokkos/example/grow_array/main.cpp b/lib/kokkos/example/grow_array/main.cpp index e7438a9bf4..3f1d534d93 100644 --- a/lib/kokkos/example/grow_array/main.cpp +++ b/lib/kokkos/example/grow_array/main.cpp @@ -88,7 +88,7 @@ int main( int argc , char ** argv ) #if defined( KOKKOS_ENABLE_OPENMP ) { std::cout << "Kokkos::OpenMP" << std::endl ; - Kokkos::OpenMP::initialize( num_threads , use_numa , use_core ); + Kokkos::OpenMP::initialize(); Example::grow_array< Kokkos::OpenMP >( length_array , span_values ); Kokkos::OpenMP::finalize(); } diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile index e716b765e7..32483a2555 100644 --- a/lib/kokkos/example/tutorial/03_simple_view/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile @@ -33,6 +33,7 @@ include $(KOKKOS_PATH)/Makefile.kokkos build: $(EXE) +#for unit testing only; for best performance use OpenMP 4.0 or better test: $(EXE) ./$(EXE) diff --git a/lib/kokkos/example/tutorial/Advanced_Views/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/Makefile index bc4012f68c..12ac5652e5 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/Makefile @@ -22,100 +22,102 @@ endif build: mkdir -p 01_data_layouts cd ./01_data_layouts; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE)
build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_memory_traits cd ./02_memory_traits; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_subviews cd ./03_subviews; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_dualviews cd ./04_dualviews; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} mkdir -p 05_NVIDIA_UVM cd ./05_NVIDIA_UVM; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #mkdir -p 06_AtomicViews #cd ./06_AtomicViews; \ - #make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #mkdir -p 07_Overlapping_DeepCopy #cd ./07_Overlapping_DeepCopy; \ - #make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_data_layouts; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make build -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) build ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make build -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) build ${KOKKOS_SETTINGS} + test: cd ./01_data_layouts; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f 
${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_data_layouts; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make test -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) test ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make test -j 4 ${KOKKOS_SETTINGS} + #$(MAKE) test ${KOKKOS_SETTINGS} + clean: cd ./01_data_layouts; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/01_data_layouts/Makefile ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/02_memory_traits/Makefile ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/03_subviews/Makefile ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/04_dualviews/Makefile ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/06_AtomicViews/Makefile ${KOKKOS_SETTINGS} #cd ./07_Overlapping_DeepCopy; \ - #make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} + #$(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_data_layouts; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_memory_traits; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_subviews; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_dualviews; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./05_NVIDIA_UVM; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} #cd ./06_AtomicViews; \ - #make clean ${KOKKOS_SETTINGS} + #$(MAKE) clean ${KOKKOS_SETTINGS} #cd 
./07_Overlapping_DeepCopy; \ - #make clean ${KOKKOS_SETTINGS} + #$(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Algorithms/Makefile b/lib/kokkos/example/tutorial/Algorithms/Makefile index ad0b76f9d6..4e70ba7d97 100644 --- a/lib/kokkos/example/tutorial/Algorithms/Makefile +++ b/lib/kokkos/example/tutorial/Algorithms/Makefile @@ -22,22 +22,22 @@ endif build: mkdir -p 01_random_numbers cd ./01_random_numbers; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_random_numbers; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} test: cd ./01_random_numbers; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_random_numbers; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} clean: cd ./01_random_numbers; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/01_random_numbers/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_random_numbers; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile index 44fdf90f8a..4bf6d487ae 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile @@ -22,74 +22,74 @@ endif build: mkdir -p 01_thread_teams cd ./01_thread_teams; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} mkdir -p 01_thread_teams_lambda cd ./01_thread_teams_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_nested_parallel_for cd ./02_nested_parallel_for; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_vectorization cd ./03_vectorization; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_team_scan cd ./04_team_scan; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} build-insource: cd ./01_thread_teams; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd 
./02_nested_parallel_for; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} test: cd ./01_thread_teams; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} test-insource: cd ./01_thread_teams; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} clean: cd ./01_thread_teams; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile ${KOKKOS_SETTINGS} clean-insource: cd ./01_thread_teams; \ - make clean 
${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./01_thread_teams_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_nested_parallel_for; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_vectorization; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_team_scan; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} diff --git a/lib/kokkos/example/tutorial/Makefile b/lib/kokkos/example/tutorial/Makefile index 063ace8aab..7b2732eeed 100644 --- a/lib/kokkos/example/tutorial/Makefile +++ b/lib/kokkos/example/tutorial/Makefile @@ -23,152 +23,152 @@ endif build: mkdir -p 01_hello_world cd ./01_hello_world; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} mkdir -p 01_hello_world_lambda cd ./01_hello_world_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_simple_reduce cd ./02_simple_reduce; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} mkdir -p 02_simple_reduce_lambda cd ./02_simple_reduce_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_simple_view cd ./03_simple_view; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} mkdir -p 03_simple_view_lambda cd ./03_simple_view_lambda; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} mkdir -p 04_simple_memoryspaces cd ./04_simple_memoryspaces; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} mkdir -p 05_simple_atomics cd ./05_simple_atomics; \ - make build -j 4 -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} mkdir -p Advanced_Views cd ./Advanced_Views; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' mkdir -p Algorithms cd ./Algorithms; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' mkdir -p Hierarchical_Parallelism cd ./Hierarchical_Parallelism; \ - make build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' 
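The recurring replacement of "make build -j 4" with "$(MAKE) build" in these tutorial Makefiles matters for recursive builds: GNU make treats a recipe line containing $(MAKE) as a sub-make and hands it the parent's jobserver and command-line flags, so the parallelism requested once at the top level (for example, make -j 8) governs the whole build tree, whereas a literal -j 4 hardcoded in each child overrides the caller's choice and can oversubscribe the machine when several children run at once. A minimal sketch of the idiom, assuming GNU make; the subdirectory name 99_example is hypothetical, while KOKKOS_PATH and KOKKOS_SETTINGS are the variables these Makefiles already use:

build:
	mkdir -p 99_example
	cd ./99_example; \
	$(MAKE) build -f ${KOKKOS_PATH}/example/tutorial/99_example/Makefile ${KOKKOS_SETTINGS}

Run as "make -j 8 build", the child make inherits the jobserver and keeps the total number of concurrent jobs at eight across parent and child.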
build-insource: cd ./01_hello_world; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make build -j 4 ${KOKKOS_SETTINGS} + $(MAKE) build ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) build KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' test: cd ./01_hello_world; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make test -j 4 -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make test -f 
${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' test-insource: cd ./01_hello_world; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make test -j 4 ${KOKKOS_SETTINGS} + $(MAKE) test ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) test KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' clean: cd ./01_hello_world; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world/Makefile ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/01_hello_world_lambda/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce/Makefile ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/02_simple_reduce_lambda/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view/Makefile ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/03_simple_view_lambda/Makefile ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/04_simple_memoryspaces/Makefile ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/05_simple_atomics/Makefile ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Advanced_Views/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) 
clean -f ${KOKKOS_PATH}/example/tutorial/Algorithms/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean -f ${KOKKOS_PATH}/example/tutorial/Hierarchical_Parallelism/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' clean-insource: cd ./01_hello_world; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./01_hello_world_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_simple_reduce; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./02_simple_reduce_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_simple_view; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./03_simple_view_lambda; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./04_simple_memoryspaces; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./05_simple_atomics; \ - make clean ${KOKKOS_SETTINGS} + $(MAKE) clean ${KOKKOS_SETTINGS} cd ./Advanced_Views; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Algorithms; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' cd ./Hierarchical_Parallelism; \ - make clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' + $(MAKE) clean KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' diff --git a/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt b/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt new file mode 100644 index 0000000000..7c78db840f --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +TRIBITS_ADD_EXECUTABLE( + tutorial_02_simple_reduce + SOURCES simple_reduce.cpp + COMM serial mpi + ) diff --git a/lib/kokkos/example/tutorial/launch_bounds/Makefile b/lib/kokkos/example/tutorial/launch_bounds/Makefile new file mode 100644 index 0000000000..5b605a4119 --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/Makefile @@ -0,0 +1,56 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/launch_bounds/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = +EXE = launch_bounds.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LINKFLAGS = +EXE = launch_bounds.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +# WAR for "undefined memcpy" w/ Ubuntu + CUDA 7.5 +CXXFLAGS += -D_FORCE_INLINES +# Additional compile-time information +CXXFLAGS += -Xptxas=-v + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +temp: + echo $(KOKKOS_INTERNAL_USE_CUDA) $(CUDA_PATH) + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp b/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp new file mode 100644 index 0000000000..9a26eda507 --- /dev/null +++ b/lib/kokkos/example/tutorial/launch_bounds/launch_bounds_reduce.cpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// First reduction (parallel_reduce) example: +// 1. Start up Kokkos +// 2. Execute a parallel_reduce loop in the default execution space, +// using a functor to define the loop body +// 3. Shut down Kokkos +// +struct collision { +// Reduction functor +// For each i, we generate 10 hashes, look for and count collisions +// We use parallel_reduce to count the total collisions +// Note that we're just counting collisions within the 10 generated +// for one i. +// This function was chosen as one that very simply can increase the +// register count. + typedef int value_type; + + KOKKOS_INLINE_FUNCTION + int hash(int q) const { + // A simple hash by Justin Sobel + // Thanks to Arash Partow (partow.net) + char* fourchars = (char*)&q; + int hash = 1315423911; + for (int i=0; i<4; fourchars++, i++) { + hash ^= ((hash<<5) + *fourchars + (hash >> 2)); + } + return hash; + } + + KOKKOS_INLINE_FUNCTION + void operator () (const int i, int& lsum) const { + //This is a silly function which generates 10 hashes + // then checks for collisions + int a = hash(i)%64; + int b = hash(i*3)%64; + int c = hash(i*5)%64; + int d = hash(i*7)%64; + int e = hash(i*11)%64; + int f = hash(i*17)%64; + int g = hash(i*23)%64; + int h = hash(i*29)%64; + int j = hash(i*31)%64; + int k = hash(i*37)%64; + + + if (a==b) lsum++; + if (a==c) lsum++; + if (a==d) lsum++; + if (a==e) lsum++; + if (a==f) lsum++; + if (a==g) lsum++; + if (a==h) lsum++; + if (a==j) lsum++; + if (a==k) lsum++; + if (b==c) lsum++; + if (b==d) lsum++; + if (b==e) lsum++; + if (b==f) lsum++; + if (b==g) lsum++; + if (b==h) lsum++; + if (b==j) lsum++; + if (b==k) lsum++; + if (c==d) lsum++; + if (c==e) lsum++; + if (c==f) lsum++; + if (c==g) lsum++; + if (c==h) lsum++; + if (c==j) lsum++; + if (c==k) lsum++; + if (d==e) lsum++; + if (d==f) lsum++; + if (d==g) lsum++; + if (d==h) lsum++; + if (d==j) lsum++; + if (d==k) lsum++; + if (e==f) lsum++; + if (e==g) lsum++; + if (e==h) lsum++; + if (e==j) lsum++; + if (e==k) lsum++; + if (f==g) lsum++; + if (f==h) lsum++; + if (f==j) lsum++; + if (f==k) lsum++; + if (g==h) lsum++; + if (g==j) lsum++; + if (g==k) lsum++; + if (h==j) lsum++; + if (h==k) lsum++; + if (j==k) lsum++; + } + + + +}; + +int main (int argc, char* argv[]) { + Kokkos::initialize (argc, argv); + const int n = 10000; + + // Compute and count hash collisions in + // parallel, using Kokkos. + // This is not really a useful algorithm, but it demonstrates the + // LaunchBounds functionality + int sum1 = 0; + int sum2 = 0; + + //Without LaunchBounds, the kernel uses 56 registers + Kokkos::parallel_reduce (n, collision (), sum1); + + //With LaunchBounds, we can reduce the register usage to 32 + Kokkos::parallel_reduce (Kokkos::RangePolicy<Kokkos::LaunchBounds<512,4>>(0,n), collision (), sum2); + + printf ("Number of collisions, " + "computed in parallel, is %i\n", sum1); + + if (sum1 != sum2) { + printf( "Uh-oh!
Results do not match\n"); + return -1; + } + + Kokkos::finalize(); + + + return 0; +} + diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 5f2442102d..6d636dc7e4 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -1,7 +1,6 @@ #!/bin/bash KOKKOS_DEVICES="" -MAKE_J_OPTION="32" KOKKOS_DO_EXAMPLES="1" @@ -70,7 +69,8 @@ do KOKKOS_DEBUG=yes ;; --make-j*) - MAKE_J_OPTION="${key#*=}" + echo "Warning: ${key} is deprecated" + echo "Call make with appropriate -j flag" ;; --no-examples) KOKKOS_DO_EXAMPLES="0" @@ -110,23 +110,34 @@ do echo "--with-devices: Explicitly add a set of backends." echo "" echo "--arch=[OPT]: Set target architectures. Options are:" + echo " [AMD]" + echo " AMDAVX = AMD CPU" + echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" + echo " [IBM]" + echo " Power8 = IBM POWER8 CPUs" + echo " Power9 = IBM POWER9 CPUs" + echo " [Intel]" + echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" echo " SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)" + echo " [Intel Xeon Phi]" echo " KNC = Intel Knights Corner Xeon Phi" echo " KNL = Intel Knights Landing Xeon Phi" + echo " [NVIDIA]" echo " Kepler30 = NVIDIA Kepler generation CC 3.0" + echo " Kepler32 = NVIDIA Kepler generation CC 3.2" echo " Kepler35 = NVIDIA Kepler generation CC 3.5" echo " Kepler37 = NVIDIA Kepler generation CC 3.7" + echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" + echo " Maxwell52 = NVIDIA Maxwell generation CC 5.2" + echo " Maxwell53 = NVIDIA Maxwell generation CC 5.3" echo " Pascal60 = NVIDIA Pascal generation CC 6.0" echo " Pascal61 = NVIDIA Pascal generation CC 6.1" - echo " Maxwell50 = NVIDIA Maxwell generation CC 5.0" - echo " Power8 = IBM POWER8 CPUs" - echo " Power9 = IBM POWER9 CPUs" echo "" echo "--compiler=/Path/To/Compiler Set the compiler." echo "--debug,-dbg: Enable Debugging." @@ -142,10 +153,14 @@ do echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc." echo "--with-options=[OPT]: Additional options to Kokkos:" + echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" + echo " disable_profiling = do not compile with profiling hooks" + echo " " echo "--with-cuda-options=[OPT]: Additional options to CUDA:" echo " force_uvm, use_ldg, enable_lambda, rdc" - echo "--make-j=[NUM]: Set -j flag used during build." 
+ echo "--make-j=[NUM]: DEPRECATED: call make with appropriate" + echo " -j flag" exit 0 ;; *) @@ -237,27 +252,27 @@ else KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH} fi -mkdir install +mkdir -p install echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/Makefile.kokkos echo "kokkos-clean:" >> install/Makefile.kokkos echo "" >> install/Makefile.kokkos -mkdir core -mkdir core/unit_test -mkdir core/perf_test -mkdir containers -mkdir containers/unit_tests -mkdir containers/performance_tests -mkdir algorithms -mkdir algorithms/unit_tests -mkdir algorithms/performance_tests -mkdir example -mkdir example/fixture -mkdir example/feint -mkdir example/fenl -mkdir example/tutorial +mkdir -p core +mkdir -p core/unit_test +mkdir -p core/perf_test +mkdir -p containers +mkdir -p containers/unit_tests +mkdir -p containers/performance_tests +mkdir -p algorithms +mkdir -p algorithms/unit_tests +mkdir -p algorithms/performance_tests +mkdir -p example +mkdir -p example/fixture +mkdir -p example/feint +mkdir -p example/fenl +mkdir -p example/tutorial if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then - mkdir example/ichol + mkdir -p example/ichol fi KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" @@ -266,115 +281,115 @@ KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "all:" >> core/unit_test/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS}" >> core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "test: all" >> core/unit_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} test" >> core/unit_test/Makefile echo "" >> core/unit_test/Makefile echo "clean:" >> core/unit_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/unit_test/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "all:" >> core/perf_test/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS}" >> core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "test: all" >> core/perf_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} test" >> core/perf_test/Makefile echo "" >> core/perf_test/Makefile echo "clean:" >> core/perf_test/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_SETTINGS} clean" >> core/perf_test/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "all:" >> containers/unit_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} 
-f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "test: all" >> containers/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/unit_tests/Makefile echo "" >> containers/unit_tests/Makefile echo "clean:" >> containers/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/unit_tests/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "all:" >> containers/performance_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS}" >> containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "test: all" >> containers/performance_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} test" >> containers/performance_tests/Makefile echo "" >> containers/performance_tests/Makefile echo "clean:" >> containers/performance_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_SETTINGS} clean" >> containers/performance_tests/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "all:" >> algorithms/unit_tests/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS}" >> algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "test: all" >> algorithms/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} test" >> algorithms/unit_tests/Makefile echo "" >> algorithms/unit_tests/Makefile echo "clean:" >> algorithms/unit_tests/Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_SETTINGS} clean" >> algorithms/unit_tests/Makefile KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_TEST_INSTALL_PATH}" echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fixture/Makefile echo "" >> example/fixture/Makefile echo "all:" >> example/fixture/Makefile -echo -e "\tmake -j 
${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS}" >> example/fixture/Makefile echo "" >> example/fixture/Makefile echo "test: all" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} test" >> example/fixture/Makefile echo "" >> example/fixture/Makefile echo "clean:" >> example/fixture/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_SETTINGS} clean" >> example/fixture/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/feint/Makefile echo "" >> example/feint/Makefile echo "all:" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS}" >> example/feint/Makefile echo "" >> example/feint/Makefile echo "test: all" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} test" >> example/feint/Makefile echo "" >> example/feint/Makefile echo "clean:" >> example/feint/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_SETTINGS} clean" >> example/feint/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/fenl/Makefile echo "" >> example/fenl/Makefile echo "all:" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS}" >> example/fenl/Makefile echo "" >> example/fenl/Makefile echo "test: all" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} test" >> example/fenl/Makefile echo "" >> example/fenl/Makefile echo "clean:" >> example/fenl/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_SETTINGS} clean" >> example/fenl/Makefile echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "build:" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} build">> example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "test: build" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile 
KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} test" >> example/tutorial/Makefile echo "" >> example/tutorial/Makefile echo "clean:" >> example/tutorial/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/tutorial/Makefile KOKKOS_SETTINGS='${KOKKOS_SETTINGS}' KOKKOS_PATH=${KOKKOS_PATH} clean" >> example/tutorial/Makefile if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > example/ichol/Makefile echo "" >> example/ichol/Makefile echo "all:" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS}" >> example/ichol/Makefile echo "" >> example/ichol/Makefile echo "test: all" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} test" >> example/ichol/Makefile echo "" >> example/ichol/Makefile echo "clean:" >> example/ichol/Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_SETTINGS} clean" >> example/ichol/Makefile fi KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}" @@ -385,62 +400,64 @@ echo "KOKKOS_SETTINGS=${KOKKOS_SETTINGS}" > Makefile echo "" >> Makefile echo "kokkoslib:" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} build-lib" >> Makefile echo "" >> Makefile echo "install: kokkoslib" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_INSTALL_PATH} install" >> Makefile echo "" >> Makefile echo "kokkoslib-test:" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} build-lib" >> Makefile echo "" >> Makefile echo "install-test: kokkoslib-test" >> Makefile echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -j ${MAKE_J_OPTION} -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} PREFIX=${KOKKOS_TEST_INSTALL_PATH} install" >> Makefile echo "" >> Makefile echo "build-test: install-test" >> Makefile -echo -e "\tmake -C core/unit_test" >> Makefile -echo -e "\tmake -C core/perf_test" >> Makefile -echo 
-e "\tmake -C containers/unit_tests" >> Makefile -echo -e "\tmake -C containers/performance_tests" >> Makefile -echo -e "\tmake -C algorithms/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture" >> Makefile -echo -e "\tmake -C example/feint" >> Makefile -echo -e "\tmake -C example/fenl" >> Makefile -echo -e "\tmake -C example/tutorial build" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture" >> Makefile +echo -e "\t\$(MAKE) -C example/feint" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial build" >> Makefile fi echo "" >> Makefile echo "test: build-test" >> Makefile -echo -e "\tmake -C core/unit_test test" >> Makefile -echo -e "\tmake -C core/perf_test test" >> Makefile -echo -e "\tmake -C containers/unit_tests test" >> Makefile -echo -e "\tmake -C containers/performance_tests test" >> Makefile -echo -e "\tmake -C algorithms/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture test" >> Makefile -echo -e "\tmake -C example/feint test" >> Makefile -echo -e "\tmake -C example/fenl test" >> Makefile -echo -e "\tmake -C example/tutorial test" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture test" >> Makefile +echo -e "\t\$(MAKE) -C example/feint test" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl test" >> Makefile +echo -e "\t\$(MAKE) -C example/tutorial test" >> Makefile fi echo "" >> Makefile echo "unit-tests-only:" >> Makefile -echo -e "\tmake -C core/unit_test test" >> Makefile -echo -e "\tmake -C containers/unit_tests test" >> Makefile -echo -e "\tmake -C algorithms/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test test" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests test" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests test" >> Makefile echo "" >> Makefile + echo "clean:" >> Makefile -echo -e "\tmake -C core/unit_test clean" >> Makefile -echo -e "\tmake -C core/perf_test clean" >> Makefile -echo -e "\tmake -C containers/unit_tests clean" >> Makefile -echo -e "\tmake -C containers/performance_tests clean" >> Makefile -echo -e "\tmake -C algorithms/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C core/unit_test clean" >> Makefile +echo -e "\t\$(MAKE) -C core/perf_test clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/unit_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C containers/performance_tests clean" >> Makefile +echo -e "\t\$(MAKE) -C algorithms/unit_tests clean" >> Makefile if [ ${KOKKOS_DO_EXAMPLES} -gt 0 ]; then -echo -e "\tmake -C example/fixture clean" >> Makefile -echo -e "\tmake -C example/feint clean" >> Makefile -echo -e "\tmake -C example/fenl clean" >> Makefile -echo -e "\tmake -C example/tutorial clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fixture clean" >> Makefile +echo -e "\t\$(MAKE) -C example/feint clean" >> Makefile +echo -e "\t\$(MAKE) -C example/fenl clean" >> 
Makefile +echo -e "\t\$(MAKE) -C example/tutorial clean" >> Makefile fi echo -e "\tcd core; \\" >> Makefile -echo -e "\tmake -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile +echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_SETTINGS} clean" >> Makefile + diff --git a/lib/mscg/Install.py b/lib/mscg/Install.py index 154f5aa522..76c986ef6d 100644 --- a/lib/mscg/Install.py +++ b/lib/mscg/Install.py @@ -65,13 +65,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # parse args diff --git a/lib/smd/Install.py b/lib/smd/Install.py index 00891339d0..9247cb449b 100644 --- a/lib/smd/Install.py +++ b/lib/smd/Install.py @@ -65,13 +65,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # parse args diff --git a/lib/voronoi/Install.py b/lib/voronoi/Install.py index 4998358d27..f40eb53bc6 100644 --- a/lib/voronoi/Install.py +++ b/lib/voronoi/Install.py @@ -64,13 +64,27 @@ def which(program): return None def geturl(url,fname): + success = False + if which('curl') != None: cmd = 'curl -L -o "%s" %s' % (fname,url) - elif which('wget') != None: + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling curl failed with: %s" % e.output.decode('UTF-8')) + + if not success and which('wget') != None: cmd = 'wget -O "%s" %s' % (fname,url) - else: error("cannot find 'wget' or 'curl' to download source code") - txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) - return txt + try: + subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True) + success = True + except subprocess.CalledProcessError as e: + print("Calling wget failed with: %s" % e.output.decode('UTF-8')) + + if not success: + error("Failed to download source code with 'curl' or 'wget'") + return # 
parse args diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 22ec8dde3b..87db73bd12 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -71,6 +71,22 @@ static const char cite_gpu_package[] = " year = 2013,\n" " volume = 184,\n" " pages = {2785--2793}\n" + "}\n\n" + "@Article{Trung15,\n" + " author = {T. D. Nguyen, S. J. Plimpton},\n" + " title = {Accelerating dissipative particle dynamics simulations for soft matter systems},\n" + " journal = {Comput.~Mater.~Sci.},\n" + " year = 2015,\n" + " volume = 100,\n" + " pages = {173--180}\n" + "}\n\n" + "@Article{Trung17,\n" + " author = {T. D. Nguyen},\n" + " title = {GPU-accelerated Tersoff potentials for massively parallel Molecular Dynamics simulations},\n" + " journal = {Comp.~Phys.~Comm.},\n" + " year = 2017,\n" + " volume = 212,\n" + " pages = {113--122}\n" "}\n\n"; /* ---------------------------------------------------------------------- */ diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index ac8279949a..6a4c4c14be 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -14,7 +14,7 @@ SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpiicpc -LINKFLAGS = -g -qopenmp $(OPTFLAGS) +LINKFLAGS = -qopenmp $(OPTFLAGS) LIB = -ltbbmalloc SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich index 389a578f72..d4cbdbdb03 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich @@ -7,7 +7,7 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = mpicxx -cxx=icc -OPTFLAGS = -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi index b65905440d..50433ce4c6 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi @@ -8,7 +8,7 @@ SHELL = /bin/sh export OMPI_CXX = icc CC = mpicxx -OPTFLAGS = -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC diff --git a/src/Makefile b/src/Makefile index 7dfc2c312a..3b67d2284f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -339,17 +339,18 @@ no-%: fi; # download/build/install a package library +# update the timestamp on main.cpp to trigger a relink with "make machine" lib-%: @if [ -e ../lib/$(LIBDIR)/Install.py ]; then \ echo "Installing lib $(@:lib-%=%)"; \ - cd ../lib/$(LIBDIR); $(PYTHON) Install.py $(args); \ + ( cd ../lib/$(LIBDIR); $(PYTHON) Install.py $(args) ); \ elif [ -e ../lib/$(LIBUSERDIR)/Install.py ]; then \ echo "Installing lib $(@:lib-user-%=%)"; \ - cd ../lib/$(LIBUSERDIR); $(PYTHON) Install.py $(args); \ + ( cd ../lib/$(LIBUSERDIR); $(PYTHON) Install.py $(args) ); \ else \ echo "Install script for lib $(@:lib-%=%) does not exist"; \ - fi; + fi; touch main.cpp # status = list src files that differ from package files # update = replace src files with newer package files diff --git a/src/USER-COLVARS/colvarproxy_lammps_version.h b/src/USER-COLVARS/colvarproxy_lammps_version.h index 834bd1748a..0eb6f2d95a 100644 --- 
a/src/USER-COLVARS/colvarproxy_lammps_version.h +++ b/src/USER-COLVARS/colvarproxy_lammps_version.h @@ -1,5 +1,5 @@ #ifndef COLVARPROXY_VERSION -#define COLVARPROXY_VERSION "2017-07-15" +#define COLVARPROXY_VERSION "2017-07-19" // This file is part of the Collective Variables module (Colvars). // The original version of Colvars and its updates are located at: // https://github.com/colvars/colvars diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh index 736059aa06..275b4839f5 100644 --- a/src/USER-INTEL/Install.sh +++ b/src/USER-INTEL/Install.sh @@ -46,6 +46,7 @@ action npair_intel.h action npair_intel.cpp action intel_simd.h pair_sw_intel.cpp action intel_intrinsics.h pair_tersoff_intel.cpp +action intel_intrinsics_airebo.h pair_airebo_intel.cpp action verlet_lrt_intel.h pppm.cpp action verlet_lrt_intel.cpp pppm.cpp diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index c02014d0ce..3b84446057 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -4,9 +4,9 @@ -------------------------------- W. Michael Brown (Intel) michael.w.brown at intel.com + Markus Hohnerbach (RWTH Aachen University) William McDoniel (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University) - Markus Höhnerbach (RWTH Aachen University) Stan Moore (Sandia) Ahmed E. Ismail (RWTH Aachen University) Paolo Bientinesi (RWTH Aachen University) diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README index 758c37bf56..434189dd26 100644 --- a/src/USER-INTEL/TEST/README +++ b/src/USER-INTEL/TEST/README @@ -8,6 +8,7 @@ # in.intel.sw - Silicon benchmark with Stillinger-Weber # in.intel.tersoff - Silicon benchmark with Tersoff # in.intel.water - Coarse-grain water benchmark using Stillinger-Weber +# in.intel.airebo - Polyethylene benchmark with AIREBO # ############################################################################# @@ -24,6 +25,7 @@ # in.intel.sw - 132.4 161.9 # in.intel.tersoff - 83.3 101.1 # in.intel.water - 53.4 90.3 +# in.intel.airebo - 7.3 11.8 # ############################################################################# diff --git a/src/USER-INTEL/TEST/in.intel.airebo b/src/USER-INTEL/TEST/in.intel.airebo new file mode 100644 index 0000000000..fcd8af4707 --- /dev/null +++ b/src/USER-INTEL/TEST/in.intel.airebo @@ -0,0 +1,47 @@ +# AIREBO polyethylene benchmark + +variable N index on # Newton Setting +variable w index 10 # Warmup Timesteps +variable t index 550 # Main Run Timesteps +variable m index 1 # Main Run Timestep Multiplier +variable n index 0 # Use NUMA Mapping for Multi-Node +variable p index 0 # Use Power Measurement +variable x index 4 +variable y index 2 +variable z index 2 + +variable xx equal 17*$x +variable yy equal 16*$y +variable zz equal 2*$z +variable rr equal floor($t*$m) +variable root getenv LMP_ROOT + +newton $N +if "$n > 0" then "processors * * * grid numa" + +variable root getenv LMP_ROOT + +units metal +atom_style atomic + +read_data ${root}/examples/airebo/data.airebo + +replicate ${xx} ${yy} ${zz} + +neighbor 0.5 bin +neigh_modify delay 5 every 1 + +pair_style airebo 3.0 1 1 +pair_coeff * * ${root}/potentials/CH.airebo C H + +velocity all create 300.0 761341 + +fix 1 all nve +timestep 0.0005 + +thermo 50 + +if "$p > 0" then "run_style verlet/power" + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam index 5a3b3064af..6486b22ee9 100644 --- a/src/USER-INTEL/TEST/in.intel.eam +++ b/src/USER-INTEL/TEST/in.intel.eam @@ -5,7 +5,6 @@ variable w index 10 # Warmup
Timesteps variable t index 3100 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier variable n index 0 # Use NUMA Mapping for Multi-Node -variable b index 3 # Neighbor binsize variable p index 0 # Use Power Measurement variable x index 4 diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo index 05145d79c0..7ce7eb4452 100644 --- a/src/USER-INTEL/TEST/in.intel.rhodo +++ b/src/USER-INTEL/TEST/in.intel.rhodo @@ -5,7 +5,6 @@ variable w index 10 # Warmup Timesteps variable t index 520 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier variable n index 0 # Use NUMA Mapping for Multi-Node -variable b index 3 # Neighbor binsize variable p index 0 # Use Power Measurement variable c index 0 # 1 to use collectives for PPPM variable d index 1 # 1 to use 'diff ad' for PPPM diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index 3664bc248b..b4b664cb94 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -30,6 +30,9 @@ IntelBuffers::IntelBuffers(class LAMMPS *lmp_in) : _off_map_listlocal = 0; _ccachex = 0; _ncache_alloc = 0; + _ncachetag = 0; + _cutneighsq = 0; + _cutneighghostsq = 0; #ifdef _LMP_INTEL_OFFLOAD _separate_buffers = 0; _off_f = 0; @@ -447,12 +450,17 @@ void IntelBuffers::free_ncache() flt_t *ncachez = _ncachez; int *ncachej = _ncachej; int *ncachejtype = _ncachejtype; + int *ncachetag = _ncachetag; #ifdef _LMP_INTEL_OFFLOAD if (_off_ncache) { #pragma offload_transfer target(mic:_cop) \ nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \ nocopy(ncachejtype:alloc_if(0) free_if(1)) + if (ncachetag) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachetag:alloc_if(0) free_if(1)) + } } _off_ncache = 0; #endif @@ -462,8 +470,10 @@ void IntelBuffers::free_ncache() lmp->memory->destroy(ncachez); lmp->memory->destroy(ncachej); lmp->memory->destroy(ncachejtype); - + if (ncachetag) + lmp->memory->destroy(ncachetag); _ncache_alloc = 0; + _ncachetag = 0; } } @@ -480,7 +490,7 @@ void IntelBuffers::grow_ncache(const int off_flag, const int vsize = _ncache_stride * nt; if (_ncache_alloc) { - if (vsize > _ncache_alloc) + if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0)) free_ncache(); #ifdef _LMP_INTEL_OFFLOAD else if (off_flag && _off_ncache == 0) @@ -495,6 +505,8 @@ void IntelBuffers::grow_ncache(const int off_flag, lmp->memory->create(_ncachez, vsize, "_ncachez"); lmp->memory->create(_ncachej, vsize, "_ncachej"); lmp->memory->create(_ncachejtype, vsize, "_ncachejtype"); + if (need_tag()) + lmp->memory->create(_ncachetag, vsize, "_ncachetag"); _ncache_alloc = vsize; @@ -513,6 +525,14 @@ void IntelBuffers::grow_ncache(const int off_flag, nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0)) } + int tsize = vsize; + if (!need_tag()) { + tsize = 16; + lmp->memory->create(_ncachetag, tsize, "_ncachetag"); + } + int *ncachetag = _ncachetag; + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0)) _off_ncache = 1; } #endif @@ -548,7 +568,8 @@ void IntelBuffers::fdotr_reduce(const int nall, /* ---------------------------------------------------------------------- */ template -void IntelBuffers::set_ntypes(const int ntypes) +void IntelBuffers::set_ntypes(const int ntypes, + const int use_ghost_cut) { if (ntypes != _ntypes) { if (_ntypes > 0) { @@ -558,16 +579,34 @@ void IntelBuffers::set_ntypes(const int ntypes) #pragma 
offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:alloc_if(0) free_if(1)) } + flt_t * cutneighghostsqo; + if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) { + cutneighghostsqo = _cutneighghostsq[0]; + #pragma offload_transfer target(mic:_cop) \ + nocopy(cutneighghostsqo:alloc_if(0) free_if(1)) + } #endif lmp->memory->destroy(_cutneighsq); + if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq); } if (ntypes > 0) { lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq"); + if (use_ghost_cut) + lmp->memory->create(_cutneighghostsq, ntypes, ntypes, + "_cutneighghostsq"); #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; + const int ntypes2 = ntypes * ntypes; if (_off_threads > 0 && cutneighsqo != NULL) { #pragma offload_transfer target(mic:_cop) \ - nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0)) + nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0)) + } + if (use_ghost_cut) { + flt_t * cutneighghostsqo = _cutneighghostsq[0]; + if (_off_threads > 0 && cutneighghostsqo != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0)) + } } #endif } diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 7a7640a203..8040715b2e 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -109,12 +109,14 @@ class IntelBuffers { void free_ncache(); void grow_ncache(const int off_flag, const int nthreads); + void grow_ncachetag(const int off_flag, const int nthreads); inline int ncache_stride() { return _ncache_stride; } inline flt_t * get_ncachex() { return _ncachex; } inline flt_t * get_ncachey() { return _ncachey; } inline flt_t * get_ncachez() { return _ncachez; } inline int * get_ncachej() { return _ncachej; } inline int * get_ncachejtype() { return _ncachejtype; } + inline int * get_ncachetag() { return _ncachetag; } inline int get_max_nbors() { int mn = lmp->neighbor->oneatom * sizeof(int) / @@ -131,7 +133,7 @@ class IntelBuffers { _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); } - void set_ntypes(const int ntypes); + void set_ntypes(const int ntypes, const int use_ghost_cut = 0); inline int * firstneigh(const NeighList *list) { return _list_alloc; } inline int * cnumneigh(const NeighList *list) { return _cnumneigh; } @@ -162,6 +164,7 @@ class IntelBuffers { inline void zero_ev() { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; } inline flt_t ** get_cutneighsq() { return _cutneighsq; } + inline flt_t ** get_cutneighghostsq() { return _cutneighghostsq; } inline int get_off_threads() { return _off_threads; } #ifdef _LMP_INTEL_OFFLOAD inline void set_off_params(const int n, const int cop, @@ -274,13 +277,10 @@ class IntelBuffers { used_ghost * sizeof(flt_t)); } } + #endif inline int need_tag() { return _need_tag; } inline void need_tag(const int nt) { _need_tag = nt; } - #else - inline int need_tag() { return 0; } - inline void need_tag(const int nt) { } - #endif double memory_usage(const int nthreads); @@ -298,7 +298,7 @@ class IntelBuffers { int _list_alloc_atoms; int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked; - flt_t **_cutneighsq; + flt_t **_cutneighsq, **_cutneighghostsq; int _ntypes; int _ccache_stride; @@ -307,7 +307,10 @@ class IntelBuffers { int _ncache_stride, _ncache_alloc; flt_t *_ncachex, *_ncachey, *_ncachez; - int *_ncachej, *_ncachejtype; + int *_ncachej, *_ncachejtype, *_ncachetag; + + int _need_tag, _host_nmax; + #ifdef LMP_USE_AVXCD int 
_ccache_stride3; acc_t * _ccachef; @@ -324,7 +327,6 @@ class IntelBuffers { int *_off_map_special, *_off_map_nspecial, *_off_map_tag; int *_off_map_numneigh; bool _off_list_alloc; - int _need_tag, _host_nmax; #endif int _buf_size, _buf_local_size; diff --git a/src/USER-INTEL/intel_intrinsics_airebo.h b/src/USER-INTEL/intel_intrinsics_airebo.h new file mode 100644 index 0000000000..7b091a4ba1 --- /dev/null +++ b/src/USER-INTEL/intel_intrinsics_airebo.h @@ -0,0 +1,2279 @@ +#ifndef LMP_INTEL_AIREBO_SCALAR +# ifdef __INTEL_COMPILER +# if defined(__MIC__) || defined(__AVX512F__) +# define LMP_INTEL_AIREBO_512 +# elif defined(__AVX__) +# define LMP_INTEL_AIREBO_256 +# else +# define LMP_INTEL_AIREBO_SCALAR +# endif +# else +# define LMP_INTEL_AIREBO_SCALAR +# endif +#endif + +#ifdef LMP_INTEL_AIREBO_512 + +#include <cassert> +#include <immintrin.h> + +#define VEC_INLINE __attribute__((always_inline)) + + +#ifndef FVEC_FIRST_PASS +# define FVEC_LEN 8 +# define FVEC_SUFFIX(a) a##pd +# define FVEC_SUFFIX_MASK(a) a##pd_mask +# define FVEC_MASK_T __mmask8 +# define FVEC_VEC_T __m512d +# define FVEC_SCAL_T double +# define IVEC_NAME ivec8 +# define FVEC_NAME fvec8pd +# define BVEC_NAME bvec8 +# define AVEC_NAME avec8pd +#else +# undef FVEC_LEN +# undef FVEC_SUFFIX +# undef FVEC_SUFFIX_MASK +# undef FVEC_MASK_T +# undef FVEC_VEC_T +# undef FVEC_SCAL_T +# undef IVEC_NAME +# undef FVEC_NAME +# undef BVEC_NAME +# undef AVEC_NAME + +# define FVEC_LEN 16 +# define FVEC_SUFFIX(a) a##ps +# define FVEC_SUFFIX_MASK(a) a##ps_mask +# define FVEC_MASK_T __mmask16 +# define FVEC_VEC_T __m512 +# define FVEC_SCAL_T float +# define IVEC_NAME ivec16 +# define FVEC_NAME fvec16ps +# define BVEC_NAME bvec16 +# define AVEC_NAME avec16ps +#endif + +namespace mm512 { + +#ifndef __AVX512F__ + +#ifndef FVEC_FIRST_PASS +VEC_INLINE static inline __m512i _mm512_mask_expand_epi32(__m512i src, + __mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))); + _mm512_store_epi32(buf, a); + return _mm512_mask_loadunpacklo_epi32(src, k, buf); +} +VEC_INLINE static inline __m512i _mm512_maskz_expand_epi32(__mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))); + _mm512_store_epi32(buf, a); + return _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, buf); +} +VEC_INLINE static inline __m512i _mm512_mask_compress_epi32(__m512i src, + __mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))); + _mm512_store_epi32(buf, src); + _mm512_mask_packstorelo_epi32(buf, k, a); + return _mm512_load_epi32(buf); +} +VEC_INLINE static inline __m512i _mm512_maskz_compress_epi32(__mmask16 k, + __m512i a) { + int buf[16] __attribute__((aligned(64))) = {0}; + _mm512_mask_packstorelo_epi32(buf, k, a); + return _mm512_load_epi32(buf); +} + +VEC_INLINE static inline void _mm512_mask_compressstoreu_epi32(int * dest, + __mmask16 mask, + __m512i src) { + _mm512_mask_packstorelo_epi32(dest, mask, src); + _mm512_mask_packstorehi_epi32(dest + 16, mask, src); +} + +VEC_INLINE static inline __m512i _mm512_mask_loadu_epi32(__m512i src, + __mmask16 k, + const int * mem_addr) { + assert((k & (k + 1)) == 0); + __m512i ret = _mm512_mask_loadunpacklo_epi32(src, k, mem_addr); + ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16); + return ret; +} +VEC_INLINE static inline __m512i _mm512_maskz_loadu_epi32(__mmask16 k, + const int * mem_addr) { + assert((k & (k + 1)) == 0); + __m512i ret = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, + mem_addr); + ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16); + return ret; +}
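(The wrappers above backfill the AVX-512 mask_compress, mask_expand, and compressstore primitives on first-generation Xeon Phi, where only the packstorelo/loadunpacklo style instructions exist, by bouncing values through an aligned stack buffer. For reference, the lane semantics being emulated can be written as a short Python model; the helper names are illustrative, not from the patch:

    def mask_compress(src, mask, a):
        # pack the mask-selected lanes of a to the front; other lanes keep src
        out, k = list(src), 0
        for i, keep in enumerate(mask):
            if keep:
                out[k] = a[i]
                k += 1
        return out

    def mask_expand(src, mask, a):
        # scatter the leading lanes of a into the mask-selected slots of src
        out, k = list(src), 0
        for i, keep in enumerate(mask):
            if keep:
                out[i] = a[k]
                k += 1
        return out

    # e.g. mask_compress([0]*4, [0, 1, 0, 1], [10, 11, 12, 13]) -> [11, 13, 0, 0]

Compress and expand are inverses of one another under the same mask, which is why the emulation can implement both with the one packstore/loadunpack pair.)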
+VEC_INLINE static inline void _mm512_mask_storeu_epi32(int * dest, + __mmask16 mask, + __m512i src) { + assert((mask & (mask + 1)) == 0); + _mm512_mask_packstorelo_epi32(dest, mask, src); + _mm512_mask_packstorehi_epi32(dest + 16, mask, src); +} +#endif + +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_expand_) + (FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(buf, a); + return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(src, k, buf); +} +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_expand_) + (__mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(buf, a); + return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(FVEC_SUFFIX(_mm512_setzero_)(), + k, buf); +} +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_compress_) + (FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(buf, src); + FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a); + return FVEC_SUFFIX(_mm512_load_)(buf); +} +VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_compress_) + (__mmask16 k, FVEC_VEC_T a) { + FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))) = {0}; + FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a); + return FVEC_SUFFIX(_mm512_load_)(buf); +} +VEC_INLINE static inline void FVEC_SUFFIX(_mm512_mask_storeu_) + (FVEC_SCAL_T * dest, FVEC_MASK_T mask, FVEC_VEC_T src) { + assert((mask & (mask + 1)) == 0); + FVEC_SUFFIX(_mm512_mask_packstorelo_)(dest, mask, src); + FVEC_SUFFIX(_mm512_mask_packstorehi_)(dest + FVEC_LEN, mask, src); +} +#endif + + +class FVEC_NAME; +class IVEC_NAME; +class AVEC_NAME; +class BVEC_NAME { + friend class FVEC_NAME; + friend class IVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==16 + friend class avec16pd; +# endif + FVEC_MASK_T val_; + VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {} +public: + VEC_INLINE BVEC_NAME() {} + VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) { + return _mm512_kand(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) { + return _mm512_kandn(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME knot(const BVEC_NAME &a) { + return _mm512_knot(a.val_); + } + VEC_INLINE static int kortestz(const BVEC_NAME &a, const BVEC_NAME &b) { + return _mm512_kortestz(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask, + const BVEC_NAME &a) { + const __m512i c_i1 = _mm512_set1_epi32(1); + __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(), + c_i1); + __m512i compressed = _mm512_mask_compress_epi32(_mm512_undefined_epi32(), + mask.val_, a_int_vec); + return _mm512_cmpeq_epi32_mask(compressed, c_i1); + } + VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src, + const BVEC_NAME &mask, + const BVEC_NAME &a) { + const __m512i c_i1 = _mm512_set1_epi32(1); + __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(), + c_i1); + __m512i src_int_vec = _mm512_mask_blend_epi32(src.val_, + _mm512_setzero_epi32(), c_i1); + __m512i compressed = _mm512_mask_expand_epi32(src_int_vec, mask.val_, + a_int_vec); + return _mm512_cmpeq_epi32_mask(compressed, c_i1); + } + VEC_INLINE static BVEC_NAME full() { + return static_cast<FVEC_MASK_T>(0xFFFF); + } + VEC_INLINE static BVEC_NAME empty() { + return 0; + } + VEC_INLINE static BVEC_NAME only(int n) { + return full().val_ >> (FVEC_LEN - n); +
} + VEC_INLINE static BVEC_NAME after(int n) { + return full().val_ << n; + } + VEC_INLINE static BVEC_NAME onlyafter(int only, int after) { + return (full().val_ >> (FVEC_LEN - only)) << after; + } + VEC_INLINE static int popcnt(const BVEC_NAME &a) { + return _popcnt32(a.val_); + } + VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) { + return _mm512_kortestz(a.val_, a.val_); + } + VEC_INLINE static bool test_any_set(const BVEC_NAME &a) { + return ! test_all_unset(a); + } + VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + return a.val_ & (1 << i); + } + VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const { + return _mm512_kand(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const { + return _mm512_kor(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator ~() const { + return _mm512_knot(val_); + } +}; + +class IVEC_NAME { + friend class FVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==16 + friend class avec16pd; +# endif + __m512i val_; + VEC_INLINE IVEC_NAME(const __m512i &v) : val_(v) {} +public: + static const int VL = 16; + VEC_INLINE IVEC_NAME() {} + + #define IVEC_MASK_BINFN_B(the_name) \ + VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm512_##the_name##_epi32_mask(a.val_, b.val_); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const IVEC_NAME &a, \ + const IVEC_NAME &b \ + ) { \ + return _mm512_mask_##the_name##_epi32_mask( \ + mask.val_, a.val_, b.val_); \ + } + IVEC_MASK_BINFN_B(cmpeq) + IVEC_MASK_BINFN_B(cmplt) + IVEC_MASK_BINFN_B(cmpneq) + IVEC_MASK_BINFN_B(cmpgt) + + #define IVEC_MASK_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME mask_##the_name( \ + const IVEC_NAME &src, const BVEC_NAME &mask, \ + const IVEC_NAME &a, const IVEC_NAME &b \ + ) { \ + return _mm512_mask_##the_name##_epi32( \ + src.val_, mask.val_, a.val_, b.val_); \ + } + IVEC_MASK_BINFN_I(add) + VEC_INLINE static IVEC_NAME mask_blend( + const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b + ) { + return _mm512_mask_blend_epi32(mask.val_, a.val_, b.val_); + } + + #define IVEC_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm512_##the_name##_epi32(a.val_, b.val_); \ + } + IVEC_BINFN_I(mullo) + IVEC_BINFN_I(srlv) + VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) { + return _mm512_and_epi32(a.val_, b.val_); + } + + VEC_INLINE static IVEC_NAME mask_expand( + const IVEC_NAME &src, const BVEC_NAME &a, const IVEC_NAME &b + ) { + return _mm512_mask_expand_epi32(src.val_, + a.val_, b.val_); + } + VEC_INLINE static IVEC_NAME masku_compress( + const BVEC_NAME &a, const IVEC_NAME &b + ) { + return _mm512_mask_compress_epi32(_mm512_undefined_epi32(), a.val_, b.val_); + } + + VEC_INLINE static int at(const IVEC_NAME &a, int b) { + int data[16] __attribute__((aligned(64))); + _mm512_store_epi32(data, a.val_); + return data[b]; + } + + VEC_INLINE static IVEC_NAME load(const int * src) { + return _mm512_load_epi32(src); + } + VEC_INLINE static IVEC_NAME mask_loadu(const BVEC_NAME &mask, + const int * src) { + assert((mask.val_ & (mask.val_ + 1)) == 0); + assert(mask.val_ <= BVEC_NAME::full().val_); + return _mm512_mask_loadu_epi32(_mm512_undefined_epi32(), mask.val_, src); + } + VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, + const int * src) { + assert((mask.val_ & (mask.val_ + 1)) == 0); + assert(mask.val_ <= BVEC_NAME::full().val_); + 
return _mm512_maskz_loadu_epi32(mask.val_, src); + } + VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, int * dest, + const IVEC_NAME &src) { + assert((mask.val_ & (mask.val_ + 1)) == 0); + assert(mask.val_ <= BVEC_NAME::full().val_); + _mm512_mask_storeu_epi32(dest, mask.val_, src.val_); + } + VEC_INLINE static void store(int * dest, const IVEC_NAME &src) { + _mm512_store_epi32(dest, src.val_); + } + + VEC_INLINE static IVEC_NAME mask_gather( + const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const int * mem, const int scale + ) { + assert(mask.val_ <= BVEC_NAME::full().val_); + assert(scale == sizeof(int)); + return _mm512_mask_i32gather_epi32(src.val_, mask.val_, idx.val_, mem, + sizeof(int)); + } + VEC_INLINE static void mask_i32scatter( + int * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const IVEC_NAME &a, const int scale + ) { + assert(mask.val_ <= BVEC_NAME::full().val_); + assert(scale == sizeof(int)); + _mm512_mask_i32scatter_epi32(mem, mask.val_, idx.val_, a.val_, sizeof(int)); + } + + VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest, + const IVEC_NAME &src) { + _mm512_mask_compressstoreu_epi32(dest, mask.val_, src.val_); + } + + VEC_INLINE static IVEC_NAME set1(int i) { + return _mm512_set1_epi32(i); + } + VEC_INLINE static IVEC_NAME setzero() { + return _mm512_setzero_epi32(); + } + VEC_INLINE static IVEC_NAME undefined() { + return _mm512_undefined_epi32(); + } + + VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const { + return _mm512_add_epi32(this->val_, b.val_); + } + VEC_INLINE static void print(const char * str, const IVEC_NAME &a) { + int data[8] __attribute__((aligned(32))); + store(data, a); + printf("%s:", str); + for (int i = 0; i < FVEC_LEN; i++) { + printf(" %d", data[i]); + } + printf("\n"); + } +}; + +class FVEC_NAME { + friend class AVEC_NAME; +#if FVEC_LEN==16 + friend class avec16pd; +#endif + FVEC_VEC_T val_; + VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {} +public: + static const int VL = FVEC_LEN; + VEC_INLINE FVEC_NAME() {} + VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm512_store_)(data, a.val_); + return data[i]; + } + VEC_INLINE static bool fast_compress() { return true; } + + #define FVEC_MASK_BINFN_B(the_name) \ + VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX_MASK(_mm512_##the_name##_)(a.val_, b.val_); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + return FVEC_SUFFIX_MASK(_mm512_mask_##the_name##_)( \ + mask.val_, a.val_, b.val_); \ + } + FVEC_MASK_BINFN_B(cmple) + FVEC_MASK_BINFN_B(cmplt) + FVEC_MASK_BINFN_B(cmpneq) + FVEC_MASK_BINFN_B(cmpnle) + FVEC_MASK_BINFN_B(cmpnlt) + + #define FVEC_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_); \ + } + FVEC_UNFN_F(abs) + FVEC_UNFN_F(exp) + FVEC_UNFN_F(invsqrt) + FVEC_UNFN_F(recip) + FVEC_UNFN_F(sqrt) + + #define FVEC_MASK_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a \ + ) { \ + return FVEC_SUFFIX(_mm512_mask_##the_name##_)( \ + src.val_, mask.val_, a.val_); \ + } + FVEC_MASK_UNFN_F(cos) + FVEC_MASK_UNFN_F(recip) + FVEC_MASK_UNFN_F(sqrt) + + #define FVEC_BINFN_F(the_name) \ + VEC_INLINE static 
FVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_, b.val_); \ + } + FVEC_BINFN_F(max) + FVEC_BINFN_F(min) + + #define FVEC_MASK_BINFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + return FVEC_SUFFIX(_mm512_mask_##the_name##_)( \ + src.val_, mask.val_, a.val_, b.val_); \ + } + FVEC_MASK_BINFN_F(add) + FVEC_MASK_BINFN_F(div) + FVEC_MASK_BINFN_F(mul) + FVEC_MASK_BINFN_F(sub) + VEC_INLINE static FVEC_NAME mask_blend( + const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm512_mask_blend_)(mask.val_, a.val_, b.val_); + } + + VEC_INLINE static FVEC_NAME mask_expand( + const FVEC_NAME &src, const BVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm512_mask_expand_)(src.val_, + a.val_, b.val_); + } + VEC_INLINE static FVEC_NAME masku_compress( + const BVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm512_mask_compress_)(FVEC_SUFFIX(_mm512_undefined_)(), + a.val_, b.val_); + } + + VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) { + return FVEC_SUFFIX(_mm512_set1_)(a); + } + VEC_INLINE static FVEC_NAME setzero() { + return FVEC_SUFFIX(_mm512_setzero_)(); + } + VEC_INLINE static FVEC_NAME undefined() { + return FVEC_SUFFIX(_mm512_undefined_)(); + } + + VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) { + return FVEC_SUFFIX(_mm512_load_)(mem); + } + VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, FVEC_SCAL_T * dest, + const FVEC_NAME &a) { + FVEC_SUFFIX(_mm512_mask_storeu_)(dest, mask.val_, a.val_); + } + VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) { + FVEC_SUFFIX(_mm512_store_)(dest, a.val_); + } + + VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); +# else + return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T)); +# endif + } + VEC_INLINE static FVEC_NAME mask_gather( + const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# else + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# endif + } + + VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale, + FVEC_NAME * out_0, + FVEC_NAME * out_1, + FVEC_NAME * out_2) { + assert(scale == sizeof(FVEC_SCAL_T)); + *out_0 = FVEC_NAME::gather(idx, mem + 0, scale); + *out_1 = FVEC_NAME::gather(idx, mem + 1, scale); + *out_2 = FVEC_NAME::gather(idx, mem + 2, scale); + } + VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale, FVEC_NAME * out_0, + FVEC_NAME * out_1, + FVEC_NAME * out_2, + FVEC_NAME * out_3) { + assert(scale == sizeof(FVEC_SCAL_T)); + *out_0 = FVEC_NAME::gather(idx, mem + 0, scale); + *out_1 = FVEC_NAME::gather(idx, mem + 1, scale); + *out_2 = FVEC_NAME::gather(idx, mem + 2, scale); + *out_3 = FVEC_NAME::gather(idx, mem + 3, scale); + } + + VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask, + const FVEC_NAME &a) { + return 
FVEC_SUFFIX(_mm512_mask_reduce_add_)(mask.val_, a.val_); + } + VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) { + return FVEC_SUFFIX(_mm512_reduce_add_)(a.val_); + } + + VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) { +# if FVEC_LEN==8 + return _mm512_maskz_compress_epi32(0x5555, _mm512_castpd_si512(a.val_)); +# else + return _mm512_castps_si512(a.val_); +# endif + } + + VEC_INLINE static FVEC_NAME mask_sincos( + FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b, + const BVEC_NAME &mask, const FVEC_NAME &arg + ) { + return FVEC_SUFFIX(_mm512_mask_sincos_)(&cos->val_, src_a.val_, src_b.val_, + mask.val_, arg.val_); + } + + #define FVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_); \ + } + FVEC_BINOP(+, add) + FVEC_BINOP(-, sub) + FVEC_BINOP(*, mul) + FVEC_BINOP(/, div) + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + #ifdef __AVX512PF__ + _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, + sizeof(FVEC_SCAL_T), _MM_HINT_T0); + #endif + } +}; + +class AVEC_NAME { + FVEC_VEC_T val_; + VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {} +public: + VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {} + VEC_INLINE static AVEC_NAME undefined() { + return FVEC_SUFFIX(_mm512_undefined_)(); + } + VEC_INLINE static AVEC_NAME mask_gather( + const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# else + return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_, + mem, sizeof(FVEC_SCAL_T)); +# endif + } + VEC_INLINE static void mask_i32loscatter( + FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const AVEC_NAME &a, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==8 + FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_, + sizeof(FVEC_SCAL_T)); +# else + FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_, + sizeof(FVEC_SCAL_T)); +# endif + } + + #define AVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_); \ + } + AVEC_BINOP(-, sub) + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, + sizeof(FVEC_SCAL_T), _MM_HINT_T0); + } +}; + +#if FVEC_LEN==16 +class avec16pd { + __m512d lo_, hi_; + VEC_INLINE avec16pd(const __m512d &lo, const __m512d &hi) : lo_(lo), hi_(hi) + {} + VEC_INLINE static __mmask8 get_bvec_hi(__mmask16 a) { + return a >> 8; + } + VEC_INLINE static __m512i get_ivec_hi(__m512i a) { + return _mm512_permute4f128_epi32(a, _MM_PERM_BADC); + } +public: + VEC_INLINE avec16pd(const FVEC_NAME &a) { + lo_ = _mm512_cvtpslo_pd(a.val_); + hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC)); + } + VEC_INLINE static avec16pd undefined() { + return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd()); + } + VEC_INLINE static avec16pd mask_gather( + const avec16pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const double * mem, const int scale + ) { + assert(scale == sizeof(double)); + __m512d lo = 
_mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem, + sizeof(double)); + __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_), + get_ivec_hi(idx.val_), mem, + sizeof(double)); + return avec16pd(lo, hi); + } + VEC_INLINE static void mask_i32loscatter( + double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const avec16pd &a, const int scale + ) { + assert(scale == sizeof(double)); + _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_, + sizeof(double)); + _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_), + get_ivec_hi(idx.val_), a.hi_, sizeof(double)); + } + + #define AVEC2_BINOP(the_sym, the_name) \ + VEC_INLINE inline avec16pd operator the_sym(const avec16pd &b) const { \ + __m512d lo = _mm512_##the_name##_pd(this->lo_, b.lo_); \ + __m512d hi = _mm512_##the_name##_pd(this->hi_, b.hi_); \ + return avec16pd(lo, hi); \ + } + AVEC2_BINOP(-, sub) + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, + sizeof(double), _MM_HINT_T0); + } +}; +#endif + +} + + +#ifdef FVEC_FIRST_PASS + +template<typename flt_t, typename acc_t> +struct intr_types; + +template<> +struct intr_types<double,double> { + typedef mm512::fvec8pd fvec; + typedef mm512::ivec8 ivec; + typedef mm512::bvec8 bvec; + typedef mm512::avec8pd avec; +}; + +template<> +struct intr_types<float,float> { + typedef mm512::fvec16ps fvec; + typedef mm512::ivec16 ivec; + typedef mm512::bvec16 bvec; + typedef mm512::avec16ps avec; +}; + +template<> +struct intr_types<float,double> { + typedef mm512::fvec16ps fvec; + typedef mm512::ivec16 ivec; + typedef mm512::bvec16 bvec; + typedef mm512::avec16pd avec; +}; + +#endif + + +#ifndef FVEC_FIRST_PASS +# define FVEC_FIRST_PASS +# include "intel_intrinsics_airebo.h" +#endif + +#endif + +#ifdef LMP_INTEL_AIREBO_256 + +#include <cassert> +#include <immintrin.h> +#include <stdint.h> + +#define VEC_INLINE __attribute__((always_inline)) + + +#ifndef FVEC_FIRST_PASS +# define FVEC_LEN 4 +# define FVEC_SUFFIX(a) a##pd +# define FVEC_MASK_T __m256d +# define FVEC_VEC_T __m256d +# define FVEC_SCAL_T double +# define IVEC_NAME ivec4 +# define FVEC_NAME fvec4pd +# define BVEC_NAME bvec4 +# define AVEC_NAME avec4pd +#else +# undef FVEC_LEN +# undef FVEC_SUFFIX +# undef FVEC_SUFFIX_MASK +# undef FVEC_MASK_T +# undef FVEC_VEC_T +# undef FVEC_SCAL_T +# undef IVEC_NAME +# undef FVEC_NAME +# undef BVEC_NAME +# undef AVEC_NAME + +# define FVEC_LEN 8 +# define FVEC_SUFFIX(a) a##ps +# define FVEC_MASK_T __m256 +# define FVEC_VEC_T __m256 +# define FVEC_SCAL_T float +# define IVEC_NAME ivec8 +# define FVEC_NAME fvec8ps +# define BVEC_NAME bvec8 +# define AVEC_NAME avec8ps +#endif + + + +namespace mm256 { + +//#define __AVX2__ __AVX2__ + +#if !defined(__AVX2__) && !defined(FVEC_FIRST_PASS) + +#define IVEC_EM_BIN(op) \ + __m128i a_lo = _mm256_castsi256_si128(a); \ + __m128i b_lo = _mm256_castsi256_si128(b); \ + __m128i a_hi = _mm256_extractf128_si256(a, 1); \ + __m128i b_hi = _mm256_extractf128_si256(b, 1); \ + __m128i c_lo = op(a_lo, b_lo); \ + __m128i c_hi = op(a_hi, b_hi); \ + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); \ + return ret; + +VEC_INLINE inline __m256i _cm256_add_epi32(const __m256i &a, const __m256i &b) { + IVEC_EM_BIN(_mm_add_epi32) +} + +VEC_INLINE inline __m256i _cm256_and_si256(const __m256i &a, const __m256i &b) { + IVEC_EM_BIN(_mm_and_si128) +} + +VEC_INLINE inline __m256i _cm256_andnot_si256(const __m256i &a, + const __m256i &b) { + IVEC_EM_BIN(_mm_andnot_si128) +} + +VEC_INLINE inline __m256i _cm256_cmpeq_epi32(const __m256i &a, + const __m256i &b) { +
IVEC_EM_BIN(_mm_cmpeq_epi32) +} + +VEC_INLINE inline __m256i _cm256_cmpgt_epi32(const __m256i &a, + const __m256i &b) { + IVEC_EM_BIN(_mm_cmpgt_epi32) +} + +VEC_INLINE inline __m256i _cm256_cvtepu8_epi32(const __m128i &a) { + __m128i a_hi = _mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(a), 1)); + __m128i c_lo = _mm_cvtepu8_epi32(a); + __m128i c_hi = _mm_cvtepu8_epi32(a_hi); + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); + return ret; + +} + +#define IVEC_EM_SCAL(op) \ + int buf_a[8] __attribute__((aligned(32))); \ + int buf_b[8] __attribute__((aligned(32))); \ + int dest[8] __attribute__((aligned(32))); \ + _mm256_store_si256((__m256i*)buf_a, a); \ + _mm256_store_si256((__m256i*)buf_b, b); \ + for (int i = 0; i < 8; i++) { \ + dest[i] = op; \ + } \ + return _mm256_load_si256((__m256i*) dest); + +VEC_INLINE inline __m256i _cm256_permutevar8x32_epi32(const __m256i &a, + const __m256i &b) { + IVEC_EM_SCAL(buf_a[buf_b[i]]) +} + +VEC_INLINE inline __m256i _cm256_mullo_epi32(__m256i a, __m256i b) { + IVEC_EM_BIN(_mm_mullo_epi32) +} + +VEC_INLINE inline __m256i _cm256_srlv_epi32(__m256i a, __m256i b) { + IVEC_EM_SCAL(buf_a[i] >> buf_b[i]) +} + + +VEC_INLINE inline __m256 _cm256_permutevar8x32_ps(const __m256 &a, + const __m256i &b) { + return _mm256_castsi256_ps(_cm256_permutevar8x32_epi32(_mm256_castps_si256(a), + b)); +} + +VEC_INLINE inline __m128i _cm_maskload_epi32(int const * mem, __m128i mask) { + return _mm_castps_si128(_mm_maskload_ps((float const *) mem, mask)); +} + +VEC_INLINE inline __m256i _cm256_maskload_epi32(int const * mem, __m256i mask) { + __m128i a_lo = _mm256_castsi256_si128(mask); + __m128i a_hi = _mm256_extractf128_si256(mask, 1); + __m128i c_lo = _cm_maskload_epi32(mem, a_lo); + __m128i c_hi = _cm_maskload_epi32(mem + 4, a_hi); + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); + return ret; +} + + +VEC_INLINE inline __m256i _cm256_mask_i32gather_epi32(__m256i src, + int const * base_addr, + __m256i index, + __m256i mask, + const int scale) { + assert(scale == sizeof(int)); + int buf_index[8] __attribute__((aligned(32))); + int buf_mask[8] __attribute__((aligned(32))); + int dest[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*)dest, src); + _mm256_store_si256((__m256i*)buf_index, index); + _mm256_store_si256((__m256i*)buf_mask, mask); + for (int i = 0; i < 8; i++) { + if (buf_mask[i]) dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_si256((__m256i*) dest); +} + +VEC_INLINE inline __m256 _cm256_mask_i32gather_ps(__m256 src, + float const * base_addr, + __m256i index, __m256 mask, + const int scale) { + return _mm256_castsi256_ps(_cm256_mask_i32gather_epi32( + _mm256_castps_si256(src), (const int *) base_addr, index, + _mm256_castps_si256(mask), scale)); +} + +VEC_INLINE inline __m256d _cm256_mask_i32gather_pd(__m256d src, + double const * base_addr, + __m128i index, __m256d mask, + const int scale) { + assert(scale == sizeof(double)); + int buf_index[4] __attribute__((aligned(32))); + int buf_mask[8] __attribute__((aligned(32))); + double dest[4] __attribute__((aligned(32))); + _mm256_store_pd(dest, src); + _mm_store_si128((__m128i*)buf_index, index); + _mm256_store_si256((__m256i*)buf_mask, _mm256_castpd_si256(mask)); + for (int i = 0; i < 4; i++) { + if (buf_mask[2*i]) dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_pd(dest); +} + +VEC_INLINE inline __m256i _cm256_i32gather_epi32(int const * base_addr, + __m256i index, + const int scale) { + assert(scale == sizeof(int)); + int buf_index[8] __attribute__((aligned(32))); + int dest[8] 
__attribute__((aligned(32))); + _mm256_store_si256((__m256i*)buf_index, index); + for (int i = 0; i < 8; i++) { + dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_si256((__m256i*) dest); +} + +VEC_INLINE inline __m256 _cm256_i32gather_ps(float const * base_addr, + __m256i index, const int scale) { + return _mm256_castsi256_ps(_cm256_i32gather_epi32((const int *) base_addr, + index, scale)); +} + +VEC_INLINE inline __m256d _cm256_i32gather_pd(double const * base_addr, + __m128i index, const int scale) { + assert(scale == sizeof(double)); + int buf_index[4] __attribute__((aligned(32))); + double dest[4] __attribute__((aligned(32))); + _mm_store_si128((__m128i*)buf_index, index); + for (int i = 0; i < 4; i++) { + dest[i] = base_addr[buf_index[i]]; + } + return _mm256_load_pd(dest); +} + +VEC_INLINE inline uint64_t _cdep_u64(uint64_t tmp, uint64_t mask) { + uint64_t dst = 0; + uint64_t k = 0; + const uint64_t one = 1; + const uint64_t zero = 0; + for (uint64_t m = 0; m < 64; m++) { + if (mask & (one << m)) { + dst |= static_cast<uint64_t>((tmp & (one << k)) != zero) << m; + k += 1; + } + } + return dst; +} + +VEC_INLINE inline uint64_t _cext_u64(uint64_t tmp, uint64_t mask) { + uint64_t dst = 0; + uint64_t k = 0; + const uint64_t one = 1; + const uint64_t zero = 0; + for (uint64_t m = 0; m < 64; m++) { + if (mask & (one << m)) { + dst |= static_cast<uint64_t>((tmp & (one << m)) != zero) << k; + k += 1; + } + } + return dst; +} + +#define _mm256_add_epi32 _cm256_add_epi32 +#define _mm256_and_si256 _cm256_and_si256 +#define _mm256_andnot_si256 _cm256_andnot_si256 +#define _mm256_cmpeq_epi32 _cm256_cmpeq_epi32 +#define _mm256_cmpgt_epi32 _cm256_cmpgt_epi32 +#define _mm256_permutevar8x32_epi32 _cm256_permutevar8x32_epi32 +#define _mm256_permutevar8x32_ps _cm256_permutevar8x32_ps +#define _mm_maskload_epi32 _cm_maskload_epi32 +#define _mm256_maskload_epi32 _cm256_maskload_epi32 +#define _mm256_mullo_epi32 _cm256_mullo_epi32 +#define _mm256_srlv_epi32 _cm256_srlv_epi32 +#define _mm256_mask_i32gather_epi32 _cm256_mask_i32gather_epi32 +#define _mm256_mask_i32gather_pd _cm256_mask_i32gather_pd +#define _mm256_mask_i32gather_ps _cm256_mask_i32gather_ps +#define _mm256_i32gather_epi32 _cm256_i32gather_epi32 +#define _mm256_i32gather_pd _cm256_i32gather_pd +#define _mm256_i32gather_ps _cm256_i32gather_ps +#define _pdep_u64 _cdep_u64 +#define _pext_u64 _cext_u64 +#define _mm256_cvtepu8_epi32 _cm256_cvtepu8_epi32 + +#endif + +#ifndef FVEC_FIRST_PASS + +VEC_INLINE inline __m256 _mm256_compress_ps(__m256 mask, __m256 a) { +# ifdef __AVX2__ + uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask), + 0x0101010101010101); + // unpack each bit to a byte + expanded_mask *= 0xFF; // mask |= mask<<1 | mask<<2 | ...
| mask<<7; + // the identity shuffle for vpermps, packed to one index per byte + const uint64_t identity_indices = 0x0706050403020100; + uint64_t wanted_indices = _pext_u64(identity_indices, expanded_mask); + + __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); + __m256i shufmask = _mm256_cvtepu8_epi32(bytevec); + + return _mm256_permutevar8x32_ps(a, shufmask); +# else + int mask_buf[8] __attribute__((aligned(32))); + float a_buf[8] __attribute__((aligned(32))); + float dst_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask)); + _mm256_store_ps(a_buf, a); + int k = 0; + for (int i = 0; i < 8; i++) { + if (mask[i]) { + dst_buf[k++] = a_buf[i]; + } + } + return _mm256_load_ps(dst_buf); +# endif +} +VEC_INLINE inline __m256 _mm256_expand_ps(__m256 mask, __m256 a) { +# ifdef __AVX2__ + uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask), + 0x0101010101010101); + expanded_mask *= 0xFF; + const uint64_t identity_indices = 0x0706050403020100; + uint64_t wanted_indices = _pdep_u64(identity_indices, expanded_mask); + __m128i bytevec = _mm_cvtsi64_si128(wanted_indices); + __m256i shufmask = _mm256_cvtepu8_epi32(bytevec); + return _mm256_permutevar8x32_ps(a, shufmask); +# else + int mask_buf[8] __attribute__((aligned(32))); + float a_buf[8] __attribute__((aligned(32))); + float dst_buf[8] __attribute__((aligned(32))) = {0}; + _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask)); + _mm256_store_ps(a_buf, a); + int k = 0; + for (int i = 0; i < 8; i++) { + if (mask[i]) { + dst_buf[i] = a_buf[k++]; + } + } + return _mm256_load_ps(dst_buf); +# endif +} + +VEC_INLINE inline __m256d _mm256_compress_pd(__m256d mask, __m256d a) { + return _mm256_castps_pd(_mm256_compress_ps(_mm256_castpd_ps(mask), + _mm256_castpd_ps(a))); +} +VEC_INLINE inline __m256d _mm256_expand_pd(__m256d mask, __m256d a) { + return _mm256_castps_pd(_mm256_expand_ps(_mm256_castpd_ps(mask), + _mm256_castpd_ps(a))); +} +#endif + + +class FVEC_NAME; +class IVEC_NAME; +class AVEC_NAME; +class BVEC_NAME { + friend class FVEC_NAME; + friend class IVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==8 + friend class avec8pd; +# endif + FVEC_MASK_T val_; + VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {} + VEC_INLINE BVEC_NAME(const __m256i &v) : val_(FVEC_SUFFIX(_mm256_castsi256_) + (v)) {} +public: + VEC_INLINE BVEC_NAME() {} + VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) { + return FVEC_SUFFIX(_mm256_and_)(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) { + return FVEC_SUFFIX(_mm256_andnot_)(a.val_, b.val_); + } + VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask, + const BVEC_NAME &a) { + return FVEC_SUFFIX(_mm256_compress_)(mask.val_, a.val_); + } + VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src, + const BVEC_NAME &mask, + const BVEC_NAME &a) { + FVEC_MASK_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, a.val_); + ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret); + ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_) + (mask.val_, src.val_)); + return ret; + } + VEC_INLINE static BVEC_NAME full() { + __m256i a = _mm256_undefined_si256(); + return FVEC_SUFFIX(_mm256_castsi256_)(_mm256_cmpeq_epi32(a, a)); + } + VEC_INLINE static BVEC_NAME empty() { + return FVEC_SUFFIX(_mm256_setzero_)(); + } + VEC_INLINE static BVEC_NAME only(int n) { + static const unsigned int FULL_ps = (unsigned int) -1; + static const unsigned int LUT_ps[9][8] = { + {0, 0, 0, 0, 
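(The BMI2 branch of _mm256_compress_ps above turns the 8-bit lane mask from _mm256_movemask_ps into byte-granular shuffle indices for _mm256_permutevar8x32_ps, using _pdep_u64/_pext_u64, with _cdep_u64/_cext_u64 as the scalar fallback defined earlier. A small Python sketch of that index computation, under the same 8-lane layout; the function names are illustrative:

    def pdep(value, mask):
        # deposit the low bits of value into the set-bit positions of mask
        out, k = 0, 0
        for m in range(64):
            if mask >> m & 1:
                out |= (value >> k & 1) << m
                k += 1
        return out

    def pext(value, mask):
        # gather the bits of value at the set positions of mask, packed low
        out, k = 0, 0
        for m in range(64):
            if mask >> m & 1:
                out |= (value >> m & 1) << k
                k += 1
        return out

    def compress_indices(lane_mask):
        expanded = pdep(lane_mask, 0x0101010101010101) * 0xFF  # bit i -> byte i
        wanted = pext(0x0706050403020100, expanded)  # keep the selected indices
        return [wanted >> (8 * i) & 0xFF for i in range(8)]

    # e.g. compress_indices(0b1010) -> [1, 3, 0, 0, 0, 0, 0, 0]

Feeding those indices to the permute moves the selected lanes to the front of the vector, which is exactly the compress semantics the AVX-512 path gets from a single instruction.)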
0, 0, 0, 0}, + {FULL_ps, 0, 0, 0, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, 0, 0, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0}, + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + }; + static const unsigned long long FULL_pd = (unsigned long long) -1; + static const unsigned long long LUT_pd[5][4] = { + {0, 0, 0, 0}, + {FULL_pd, 0, 0, 0}, + {FULL_pd, FULL_pd, 0, 0}, + {FULL_pd, FULL_pd, FULL_pd, 0}, + {FULL_pd, FULL_pd, FULL_pd, FULL_pd}, + }; + return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]); + } + VEC_INLINE static BVEC_NAME after(int n) { + static const unsigned int FULL_ps = (unsigned int) -1; + static const unsigned int LUT_ps[9][8] = { + {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps}, + {0, 0, 0, 0, 0, 0, FULL_ps, FULL_ps}, + {0, 0, 0, 0, 0, 0, 0, FULL_ps}, + {0, 0, 0, 0, 0, 0, 0, 0}, + }; + static const unsigned long long FULL_pd = (unsigned long long) -1; + static const unsigned long long LUT_pd[5][4] = { + {FULL_pd, FULL_pd, FULL_pd, FULL_pd}, + {0, FULL_pd, FULL_pd, FULL_pd}, + {0, 0, FULL_pd, FULL_pd}, + {0, 0, 0, FULL_pd}, + {0, 0, 0, 0}, + }; + return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]); + } + VEC_INLINE static BVEC_NAME onlyafter(int only_, int after_) { + return kand(after(after_), only(after_ + only_)); + } + VEC_INLINE static int popcnt(const BVEC_NAME &a) { + return _popcnt32(FVEC_SUFFIX(_mm256_movemask_)(a.val_)); + } + VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) { + return FVEC_SUFFIX(_mm256_testz_)(a.val_, a.val_); + } + VEC_INLINE static bool test_any_set(const BVEC_NAME &a) { + return ! 
test_all_unset(a); + } + VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + return FVEC_SUFFIX(_mm256_movemask_)(a.val_) & (1 << i); + } + VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const { + return FVEC_SUFFIX(_mm256_and_)(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const { + return FVEC_SUFFIX(_mm256_or_)(val_, b.val_); + } + VEC_INLINE BVEC_NAME operator ~() const { + return FVEC_SUFFIX(_mm256_andnot_)(val_, full().val_); + } +}; + +class IVEC_NAME { + friend class FVEC_NAME; + friend class AVEC_NAME; +# if FVEC_LEN==8 + friend class avec8pd; +# endif + __m256i val_; + VEC_INLINE IVEC_NAME(const __m256i &v) : val_(v) {} + VEC_INLINE static __m256i to(const FVEC_VEC_T &a) { +# if FVEC_LEN==4 + return _mm256_castpd_si256(a); +# else + return _mm256_castps_si256(a); +# endif + } + VEC_INLINE static FVEC_VEC_T from(const __m256i &a) { + return FVEC_SUFFIX(_mm256_castsi256_)(a); + } +public: + static const int VL = 8; + VEC_INLINE IVEC_NAME() {} + + #define IVEC_MASK_BINFN_B(the_name) \ + VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm256_##the_name##_epi32(a.val_, b.val_); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const IVEC_NAME &a, const IVEC_NAME &b \ + ) { \ + BVEC_NAME ret = _mm256_##the_name##_epi32( \ + a.val_, b.val_); \ + return mask & ret; \ + } + IVEC_MASK_BINFN_B(cmpeq) + IVEC_MASK_BINFN_B(cmpgt) + + VEC_INLINE static __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) { + __m256i le = _mm256_cmpgt_epi32(b, a); + __m256i eq = _mm256_cmpeq_epi32(a, b); + return _mm256_andnot_si256(eq, le); + } + + VEC_INLINE static __m256i _mm256_cmpneq_epi32(__m256i a, __m256i b) { + __m256i eq = _mm256_cmpeq_epi32(a, b); + __m256i t = _mm256_undefined_si256(); + __m256i f = _mm256_cmpeq_epi32(t, t); + return _mm256_andnot_si256(eq, f); + } + + IVEC_MASK_BINFN_B(cmplt) + IVEC_MASK_BINFN_B(cmpneq) + #undef IVEC_MASK_BINFN_B + + VEC_INLINE static IVEC_NAME mask_blend( + const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b + ) { + return to(FVEC_SUFFIX(_mm256_blendv_)(from(a.val_), from(b.val_), + mask.val_)); + } + #define IVEC_MASK_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME mask_##the_name( \ + const IVEC_NAME &src, const BVEC_NAME &mask, \ + const IVEC_NAME &a, const IVEC_NAME &b \ + ) { \ + IVEC_NAME ret = _mm256_##the_name##_epi32( \ + a.val_, b.val_); \ + return mask_blend(mask, src, ret); \ + } + IVEC_MASK_BINFN_I(add) + #undef IVEC_MASK_BINFN_I + + #define IVEC_BINFN_I(the_name) \ + VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a, \ + const IVEC_NAME &b) { \ + return _mm256_##the_name##_epi32(a.val_, b.val_); \ + } + IVEC_BINFN_I(mullo) + IVEC_BINFN_I(srlv) + #undef IVEC_BINFN_I + VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) { + return _mm256_and_si256(a.val_, b.val_); + } + + VEC_INLINE static IVEC_NAME masku_compress(const BVEC_NAME &mask, + const IVEC_NAME &b) { + return to(FVEC_SUFFIX(_mm256_compress_)(mask.val_, from(b.val_))); + } + VEC_INLINE static IVEC_NAME mask_expand( + const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &b + ) { + FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, from(b.val_)); + ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret); + ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_) + (mask.val_, from(src.val_))); + return to(ret); + } + + VEC_INLINE static void store(int * dest, const IVEC_NAME &src) { + 
_mm256_store_si256((__m256i*)dest, src.val_); +# if FVEC_LEN==4 + dest[1] = dest[2]; + dest[2] = dest[4]; + dest[3] = dest[6]; +# endif + } + + VEC_INLINE static int at(const IVEC_NAME &a, int b) { + int data[8] __attribute__((aligned(32))); + store(data, a); + return data[b]; + } + + VEC_INLINE static void print(const char * str, const IVEC_NAME &a) { + int data[8] __attribute__((aligned(32))); + store(data, a); + printf("%s:", str); + for (int i = 0; i < FVEC_LEN; i++) { + printf(" %d", data[i]); + } + printf("\n"); + } + + VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, + const int * src) { + FVEC_VEC_T mask_val = mask.val_; +# if FVEC_LEN==4 +# ifdef __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 2, 4, 6, 0, 0, 0, 0}; + __m256 m = _mm256_castpd_ps(mask_val); + m = _mm256_permutevar8x32_ps(m, _mm256_load_si256((__m256i*)mask_shuffle)); + __m128i ret = _mm_maskload_epi32(src, + _mm256_castsi256_si128(_mm256_castps_si256(m))); + static const unsigned int load_shuffle[8] __attribute__((aligned(32))) = + {0, 0, 1, 1, 2, 2, 3, 3}; + return _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ret), + _mm256_load_si256((__m256i*)load_shuffle)); +# else + int dest[8] __attribute__((aligned(32))) = {0}; + int mask_buf[8] __attribute__((aligned(32))); + _mm256_store_pd((double*) mask_buf, mask.val_); + for (int i = 0; i < 4; i++) { + if (mask_buf[2*i]) { + int val = src[i]; + dest[2*i+0] = val; + dest[2*i+1] = val; + } + } + return _mm256_load_si256((__m256i*) dest); +# endif +# else + return _mm256_maskload_epi32(src, to(mask_val)); +# endif + } + + VEC_INLINE static IVEC_NAME mask_gather( + const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const int * mem, const int scale + ) { + assert(scale == sizeof(int)); + return _mm256_mask_i32gather_epi32(src.val_, mem, idx.val_, to(mask.val_), + sizeof(int)); + } + + VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest, + const IVEC_NAME &src) { + int buf[8] __attribute__((aligned(64))); + const int stride = FVEC_LEN==4 ? 
2 : 1; + _mm256_store_si256((__m256i*)buf, src.val_); + int mask_val = FVEC_SUFFIX(_mm256_movemask_)(mask.val_); + int k = 0; + #pragma unroll + for (int i = 0; i < FVEC_LEN; i++) { + if (mask_val & (1 << i)) + dest[k++] = buf[stride*i]; + } + } + + VEC_INLINE static IVEC_NAME set1(int i) { + return _mm256_set1_epi32(i); + } + VEC_INLINE static IVEC_NAME setzero() { + return _mm256_setzero_si256(); + } + VEC_INLINE static IVEC_NAME undefined() { + return _mm256_undefined_si256(); + } + + VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const { + return _mm256_add_epi32(this->val_, b.val_); + } +}; + +class FVEC_NAME { + friend class AVEC_NAME; +#if FVEC_LEN==8 + friend class avec8pd; +#endif + FVEC_VEC_T val_; + VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {} +public: + static const int VL = FVEC_LEN; +# if defined(__AVX2__) || defined(__MIC__) || defined(__AVX512F__) + VEC_INLINE static bool fast_compress() { return true; } +# else + VEC_INLINE static bool fast_compress() { return false; } +# endif + VEC_INLINE FVEC_NAME() {} + VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) { + assert(i < FVEC_LEN); + FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64))); + FVEC_SUFFIX(_mm256_store_)(data, a.val_); + return data[i]; + } + + #define FVEC_MASK_BINFN_B(the_name, the_imm) \ + VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX(_mm256_cmp_)(a.val_, b.val_, the_imm); \ + } \ + VEC_INLINE static BVEC_NAME mask_##the_name( \ + const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + BVEC_NAME ret = FVEC_SUFFIX(_mm256_cmp_)( \ + a.val_, b.val_, the_imm); \ + return mask & ret; \ + } + FVEC_MASK_BINFN_B(cmple, _CMP_LE_OS) + FVEC_MASK_BINFN_B(cmplt, _CMP_LT_OS) + FVEC_MASK_BINFN_B(cmpneq, _CMP_NEQ_UQ) + FVEC_MASK_BINFN_B(cmpnle, _CMP_NLE_US) + FVEC_MASK_BINFN_B(cmpnlt, _CMP_NLT_US) + #undef FVEC_MASK_BINFN_B + + VEC_INLINE static __m256d _mm256_recip_pd(__m256d a) { + __m256d c_1 = _mm256_set1_pd(1); + return _mm256_div_pd(c_1, a); + } + VEC_INLINE static __m256 _mm256_recip_ps(__m256 a) { + return _mm256_rcp_ps(a); + } + VEC_INLINE static __m256d _mm256_abs_pd(__m256d a) { + const unsigned long long abs_mask = 0x7FFFFFFFFFFFFFFF; + const unsigned long long abs_full[8] = + {abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, + abs_mask}; + return _mm256_and_pd(_mm256_load_pd((double*)abs_full), a); + } + VEC_INLINE static __m256 _mm256_abs_ps(__m256 a) { + const unsigned long long abs_mask = 0x7FFFFFFF; + const unsigned long long abs_full[16] = + {abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, + abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, abs_mask, + abs_mask, abs_mask, abs_mask}; + return _mm256_and_ps(_mm256_load_ps((float*)abs_full), a); + } + + #define FVEC_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_); \ + } + FVEC_UNFN_F(abs) + FVEC_UNFN_F(exp) + FVEC_UNFN_F(invsqrt) + FVEC_UNFN_F(recip) + FVEC_UNFN_F(sqrt) + #undef FVEC_UNFN_F + + VEC_INLINE static FVEC_NAME mask_blend( + const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm256_blendv_)(a.val_, b.val_, mask.val_); + } + #define FVEC_MASK_UNFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a \ + ) { \ + FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)( \ + a.val_); \ + return mask_blend(mask, 
src, ret); \ + } + FVEC_MASK_UNFN_F(cos) + FVEC_MASK_UNFN_F(recip) + FVEC_MASK_UNFN_F(sqrt) + #undef FVEC_MASK_UNFN_F + + #define FVEC_BINFN_F(the_name) \ + VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a, \ + const FVEC_NAME &b) { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_, b.val_); \ + } + FVEC_BINFN_F(max) + FVEC_BINFN_F(min) + #undef FVEC_BINFN_F + + #define FVEC_MASK_BINFN_F(the_name) \ + VEC_INLINE static FVEC_NAME mask_##the_name( \ + const FVEC_NAME &src, const BVEC_NAME &mask, \ + const FVEC_NAME &a, const FVEC_NAME &b \ + ) { \ + FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)( \ + a.val_, b.val_); \ + return mask_blend(mask, src, ret); \ + } + FVEC_MASK_BINFN_F(add) + FVEC_MASK_BINFN_F(div) + FVEC_MASK_BINFN_F(mul) + FVEC_MASK_BINFN_F(sub) + #undef FVEC_MASK_BINFN_F + + VEC_INLINE static FVEC_NAME mask_expand( + const FVEC_NAME &src, const BVEC_NAME &mask, const FVEC_NAME &b + ) { + FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, b.val_); + ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret); + ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_) + (mask.val_, src.val_)); + return ret; + } + VEC_INLINE static FVEC_NAME masku_compress( + const BVEC_NAME &mask, const FVEC_NAME &b + ) { + return FVEC_SUFFIX(_mm256_compress_)(mask.val_, b.val_); + } + + VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) { + return FVEC_SUFFIX(_mm256_set1_)(a); + } + VEC_INLINE static FVEC_NAME setzero() { + return FVEC_SUFFIX(_mm256_setzero_)(); + } + VEC_INLINE static FVEC_NAME undefined() { + return FVEC_SUFFIX(_mm256_undefined_)(); + } + + VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) { + return FVEC_SUFFIX(_mm256_load_)(mem); + } + VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) { + FVEC_SUFFIX(_mm256_store_)(dest, a.val_); + } + + + VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==4 +# ifdef __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 2, 4, 6, 0, 0, 0, 0}; + __m256i m = _mm256_permutevar8x32_epi32(idx.val_, + _mm256_load_si256((__m256i*)mask_shuffle)); + __m128i idx_short = _mm256_castsi256_si128(m); + return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx_short, sizeof(FVEC_SCAL_T)); +# else + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); + double dest[4] __attribute__((aligned(32))); + for (int i = 0; i < 4; i++) { + dest[i] = mem[idx_buf[2*i]]; + } + return _mm256_load_pd(dest); +# endif +# else + return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx.val_, sizeof(FVEC_SCAL_T)); +# endif + } + VEC_INLINE static FVEC_NAME mask_gather( + const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); +# if FVEC_LEN==4 +# ifdef __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 2, 4, 6, 0, 0, 0, 0}; + __m256i m = _mm256_permutevar8x32_epi32(idx.val_, + _mm256_load_si256((__m256i*)mask_shuffle)); + __m128i idx_short = _mm256_castsi256_si128(m); + return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx_short, + mask.val_, sizeof(FVEC_SCAL_T)); +# else + int idx_buf[8] __attribute__((aligned(32))); + int mask_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); + _mm256_store_pd((double*) mask_buf, mask.val_); + double dest[4] __attribute__((aligned(32))); + 
_mm256_store_pd((double*) dest, src.val_); + for (int i = 0; i < 4; i++) { + if (mask_buf[2*i]) + dest[i] = mem[idx_buf[2*i]]; + } + return _mm256_load_pd(dest); +# endif +# else + return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx.val_, + mask.val_, sizeof(FVEC_SCAL_T)); +# endif + } + + VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale, FVEC_NAME * out_0, + FVEC_NAME * out_1, FVEC_NAME * out_2, FVEC_NAME * out_3) { + assert(scale == sizeof(FVEC_SCAL_T)); + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); +# if FVEC_LEN==4 + __m256d a0 = _mm256_load_pd(&mem[idx_buf[0]]); + __m256d a1 = _mm256_load_pd(&mem[idx_buf[2]]); + __m256d a2 = _mm256_load_pd(&mem[idx_buf[4]]); + __m256d a3 = _mm256_load_pd(&mem[idx_buf[6]]); + __m256d b0 = _mm256_unpacklo_pd(a0, a1); + __m256d b1 = _mm256_unpackhi_pd(a0, a1); + __m256d b2 = _mm256_unpacklo_pd(a2, a3); + __m256d b3 = _mm256_unpackhi_pd(a2, a3); + *out_0 = _mm256_permute2f128_pd(b0, b2, 0x20); + *out_1 = _mm256_permute2f128_pd(b1, b3, 0x20); + *out_2 = _mm256_permute2f128_pd(b0, b2, 0x31); + *out_3 = _mm256_permute2f128_pd(b1, b3, 0x31); +# else + const float *e0 = &mem[idx_buf[0]]; + const float *e1 = &mem[idx_buf[1]]; + const float *e2 = &mem[idx_buf[2]]; + const float *e3 = &mem[idx_buf[3]]; + const float *e4 = &mem[idx_buf[4]]; + const float *e5 = &mem[idx_buf[5]]; + const float *e6 = &mem[idx_buf[6]]; + const float *e7 = &mem[idx_buf[7]]; + __m256 a0 = _mm256_loadu2_m128(e4, e0); + __m256 a1 = _mm256_loadu2_m128(e5, e1); + __m256 b0 = _mm256_unpacklo_ps(a0, a1); + __m256 b1 = _mm256_unpackhi_ps(a0, a1); + __m256 a2 = _mm256_loadu2_m128(e6, e2); + __m256 a3 = _mm256_loadu2_m128(e7, e3); + __m256 b2 = _mm256_unpacklo_ps(a2, a3); + __m256 b3 = _mm256_unpackhi_ps(a2, a3); + *out_0 = _mm256_shuffle_ps(b0, b2, 0x44); + *out_1 = _mm256_shuffle_ps(b0, b2, 0xEE); + *out_2 = _mm256_shuffle_ps(b1, b3, 0x44); + *out_3 = _mm256_shuffle_ps(b1, b3, 0xEE); +# endif + } + VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, + const int scale, + FVEC_NAME * out_0, + FVEC_NAME * out_1, + FVEC_NAME * out_2) { + assert(scale == sizeof(FVEC_SCAL_T)); + FVEC_NAME tmp_3; + gather_4_adjacent(idx, mem, scale, out_0, out_1, out_2, &tmp_3); + } + + VEC_INLINE static double _mm256_reduce_add_pd(__m256d a) { + __m256d t1 = _mm256_hadd_pd(a, a); + __m128d t2 = _mm256_extractf128_pd(t1, 1); + __m128d t3 = _mm256_castpd256_pd128(t1); + return _mm_cvtsd_f64(_mm_add_pd(t2, t3)); + } + + VEC_INLINE static float _mm256_reduce_add_ps(__m256 a) { + __m256 t1 = _mm256_hadd_ps(a, a); + __m128 t2 = _mm256_extractf128_ps(t1, 1); + __m128 t3 = _mm256_castps256_ps128(t1); + __m128 t4 = _mm_add_ps(t2, t3); + __m128 t5 = _mm_permute_ps(t4, 0x1B); // 0x1B = reverse + return _mm_cvtss_f32(_mm_add_ps(t4, t5)); + } + + VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) { + return FVEC_SUFFIX(_mm256_reduce_add_)(a.val_); + } + VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask, + const FVEC_NAME &a) { + return reduce_add(FVEC_SUFFIX(_mm256_and_)(mask.val_, a.val_)); + } + + VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) { +# if FVEC_LEN==4 +# if __AVX2__ + static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) = + {0, 0, 2, 2, 4, 4, 6, 6}; + __m256 m = _mm256_permutevar8x32_ps(_mm256_castpd_ps(a.val_), + _mm256_load_si256((__m256i*)mask_shuffle)); + return _mm256_castps_si256(m); +# else + 
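+ // Pre-AVX2 fallback for unpackloepi32: the 0xA0 immediate encodes the
+ // shuffle (0,0,2,2) per 128-bit half, duplicating the low 32-bit word of
+ // every 64-bit lane -- the same layout the _mm256_permutevar8x32_ps path
+ // above builds with its {0,0,2,2,4,4,6,6} index table.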
__m128i a_lo = _mm256_castsi256_si128(_mm256_castpd_si256(a.val_)); + __m128i a_hi = _mm256_extractf128_si256(_mm256_castpd_si256(a.val_), 1); + __m128i c_lo = _mm_shuffle_epi32(a_lo, 0xA0); /*1010 0000*/ + __m128i c_hi = _mm_shuffle_epi32(a_hi, 0xA0); + __m256i ret = _mm256_setr_m128i(c_lo, c_hi); + return ret; +# endif +# else + return _mm256_castps_si256(a.val_); +# endif + } + + VEC_INLINE static FVEC_NAME mask_sincos( + FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b, + const BVEC_NAME &mask, const FVEC_NAME &arg + ) { + FVEC_VEC_T c, s = FVEC_SUFFIX(_mm256_sincos_)(&c, arg.val_); + *cos = mask_blend(mask, src_b, c); + return mask_blend(mask, src_a, s); + } + + #define FVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); \ + } + FVEC_BINOP(+, add) + FVEC_BINOP(-, sub) + FVEC_BINOP(*, mul) + FVEC_BINOP(/, div) + #undef FVEC_BINOP + + VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) { + /* NOP */ + } +}; + +class AVEC_NAME { + friend class avec8pd; + FVEC_VEC_T val_; + VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {} +public: + VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {} + VEC_INLINE static AVEC_NAME undefined() { + return FVEC_SUFFIX(_mm256_undefined_)(); + } + VEC_INLINE static AVEC_NAME mask_gather( + const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const FVEC_SCAL_T * mem, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); + return FVEC_NAME::mask_gather(src.val_, mask, idx, mem, scale); + } + VEC_INLINE static void mask_i32loscatter( + FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const AVEC_NAME &a, const int scale + ) { + assert(scale == sizeof(FVEC_SCAL_T)); + for (int l = 0; l < FVEC_NAME::VL; l++) { + if (BVEC_NAME::test_at(mask, l)) + mem[IVEC_NAME::at(idx, l)] = FVEC_NAME::at(a.val_, l); + } + } + + #define AVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \ + return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); \ + } + AVEC_BINOP(-, sub) + #undef AVEC_BINOP +}; + +#if FVEC_LEN==8 +class avec8pd { + __m256d lo_, hi_; + VEC_INLINE avec8pd(const __m256d &lo, const __m256d &hi) : lo_(lo), hi_(hi) {} + VEC_INLINE static __m128 get_ps_hi(__m256 a) { + return _mm256_extractf128_ps(a, 1); + } + VEC_INLINE static __m128 get_ps_lo(__m256 a) { + return _mm256_castps256_ps128(a); + } + VEC_INLINE static __m128i get_si_hi(__m256i a) { + return _mm_castps_si128(get_ps_hi(_mm256_castsi256_ps(a))); + } + VEC_INLINE static __m128i get_si_lo(__m256i a) { + return _mm_castps_si128(get_ps_lo(_mm256_castsi256_ps(a))); + } +public: + VEC_INLINE avec8pd(const FVEC_NAME &a) { + lo_ = _mm256_cvtps_pd(get_ps_lo(a.val_)); + hi_ = _mm256_cvtps_pd(get_ps_hi(a.val_)); + } + VEC_INLINE static avec8pd undefined() { + return avec8pd(_mm256_undefined_pd(), _mm256_undefined_pd()); + } + VEC_INLINE static avec8pd mask_gather( + const avec8pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx, + const double * mem, const int scale + ) { +# ifndef __AVX2__ + assert(scale == sizeof(double)); + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*) idx_buf, idx.val_); + int mask_val = _mm256_movemask_ps(mask.val_); + double ret_buf[8] __attribute__((aligned(32))); + _mm256_store_pd(&ret_buf[0], src.lo_); + _mm256_store_pd(&ret_buf[4], src.hi_); + for (int i = 0; i < 8; i++) { + if (mask_val & (1 << 
i)) { + ret_buf[i] = mem[idx_buf[i]]; + } + } + __m256d lo = _mm256_load_pd(&ret_buf[0]); + __m256d hi = _mm256_load_pd(&ret_buf[4]); +# else + static const unsigned int lo_shuffle[8] __attribute__((aligned(32))) = + {0, 0, 1, 1, 2, 2, 3, 3}; + static const unsigned int hi_shuffle[8] __attribute__((aligned(32))) = + {4, 4, 5, 5, 6, 6, 7, 7}; + __m256d lo_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_, + _mm256_load_si256((__m256i*) lo_shuffle))); + __m256d hi_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_, + _mm256_load_si256((__m256i*) hi_shuffle))); + __m256d lo = _mm256_mask_i32gather_pd(src.lo_, mem, get_si_lo(idx.val_), + lo_mask, sizeof(double)); + __m256d hi = _mm256_mask_i32gather_pd(src.hi_, mem, get_si_hi(idx.val_), + hi_mask, sizeof(double)); +# endif + return avec8pd(lo, hi); + } + VEC_INLINE static void mask_i32loscatter( + double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, + const avec8pd &a, const int scale + ) { + assert(scale == sizeof(double)); + double a_buf[8] __attribute__((aligned(32))); + _mm256_store_pd(a_buf, a.lo_); + _mm256_store_pd(&a_buf[4], a.hi_); + int idx_buf[8] __attribute__((aligned(32))); + _mm256_store_si256((__m256i*)idx_buf, idx.val_); + int mask_val = _mm256_movemask_ps(mask.val_); + for (int i = 0; i < 8; i++) { + if (mask_val & (1 << i)) + mem[idx_buf[i]] = a_buf[i]; + } + } + + #define AVEC2_BINOP(the_sym, the_name) \ + VEC_INLINE inline avec8pd operator the_sym(const avec8pd &b) const { \ + __m256d lo = _mm256_##the_name##_pd(this->lo_, b.lo_); \ + __m256d hi = _mm256_##the_name##_pd(this->hi_, b.hi_); \ + return avec8pd(lo, hi); \ + } + AVEC2_BINOP(-, sub) +}; +#endif + +} + + +#ifdef FVEC_FIRST_PASS + +template<class flt_t, class acc_t> +struct intr_types; + +template<> +struct intr_types<double,double> { + typedef mm256::fvec4pd fvec; + typedef mm256::ivec4 ivec; + typedef mm256::bvec4 bvec; + typedef mm256::avec4pd avec; +}; + +template<> +struct intr_types<float,float> { + typedef mm256::fvec8ps fvec; + typedef mm256::ivec8 ivec; + typedef mm256::bvec8 bvec; + typedef mm256::avec8ps avec; +}; + +template<> +struct intr_types<float,double> { + typedef mm256::fvec8ps fvec; + typedef mm256::ivec8 ivec; + typedef mm256::bvec8 bvec; + typedef mm256::avec8pd avec; +}; + +#endif + +#ifndef FVEC_FIRST_PASS +# define FVEC_FIRST_PASS +# include "intel_intrinsics_airebo.h" +#endif + +#endif + +#ifdef LMP_INTEL_AIREBO_SCALAR + +#include <cassert> +#include <cmath> +#include <cstdlib> + +#define VEC_INLINE __attribute__((always_inline)) + +template<class flt_t, class acc_t> +struct intr_types { + +class fvec; +class ivec; +class avec; +class bvec { + friend class fvec; + friend class ivec; + friend class avec; + bool val_; + VEC_INLINE bvec(const bool &v) : val_(v) {} +public: + VEC_INLINE bvec() {} + VEC_INLINE static bvec kand(const bvec &a, const bvec &b) { + return a.val_ && b.val_; + } + VEC_INLINE static bvec kandn(const bvec &a, const bvec &b) { + return (! a.val_) && b.val_; + } + VEC_INLINE static bvec knot(const bvec &a) { + return ! a.val_; + } + VEC_INLINE static int kortestz(const bvec &a, const bvec &b) { + return (! a.val_) && (! b.val_) ? true : false; + } + VEC_INLINE static bvec masku_compress(const bvec &mask, const bvec &a) { + return mask.val_ ? a.val_ : false; + } + VEC_INLINE static bvec mask_expand(const bvec &src, const bvec &mask, + const bvec &a) { + return mask.val_ ? a.val_ : src.val_; + } + VEC_INLINE static bvec full() { + return true; + } + VEC_INLINE static bvec empty() { + return false; + } + VEC_INLINE static bvec only(int n) { + return n == 1 ?
true : false; + } + VEC_INLINE static bvec after(int n) { + return n == 0 ? true : false; + } + VEC_INLINE static bvec onlyafter(int only, int after) { + return after == 0 && only == 1 ? true : false; + } + VEC_INLINE static int popcnt(const bvec &a) { + return static_cast<int>(a.val_); + } + VEC_INLINE static bool test_all_unset(const bvec &a) { + return kortestz(a, a); + } + VEC_INLINE static bool test_any_set(const bvec &a) { + return ! test_all_unset(a); + } + VEC_INLINE static bool test_at(const bvec &a, int i) { + assert(i < 1); + return a.val_; + } + VEC_INLINE bvec operator &(const bvec &b) const { + return val_ && b.val_; + } + VEC_INLINE bvec operator |(const bvec &b) const { + return val_ || b.val_; + } + VEC_INLINE bvec operator ~() const { + return ! val_; + } +}; + +class ivec { + friend class fvec; + friend class avec; + int val_; + VEC_INLINE ivec(const int &v) : val_(v) {} +public: + static const int VL = 1; + VEC_INLINE ivec() {} + + #define IVEC_MASK_BINFN_B(the_name, the_op) \ + VEC_INLINE static bvec the_name(const ivec &a, const ivec &b) { \ + return a.val_ the_op b.val_; \ + } \ + VEC_INLINE static bvec mask_##the_name( \ + const bvec &mask, \ + const ivec &a, const ivec &b \ + ) { \ + return mask.val_ && (a.val_ the_op b.val_); \ + \ + } + IVEC_MASK_BINFN_B(cmpeq, ==) + IVEC_MASK_BINFN_B(cmplt, <) + IVEC_MASK_BINFN_B(cmpneq, !=) + IVEC_MASK_BINFN_B(cmpgt, >) + + #define IVEC_MASK_BINFN_I(the_name, the_op) \ + VEC_INLINE static ivec mask_##the_name( \ + const ivec &src, const bvec &mask, \ + const ivec &a, const ivec &b \ + ) { \ + return mask.val_ ? a.val_ the_op b.val_ : src.val_; \ + } + IVEC_MASK_BINFN_I(add, +) + VEC_INLINE static ivec mask_blend( + const bvec &mask, const ivec &a, const ivec &b + ) { + return mask.val_ ? b.val_ : a.val_; + } + + #define IVEC_BINFN_I(the_name, the_op) \ + VEC_INLINE static ivec the_name(const ivec &a, const ivec &b) { \ + return a.val_ the_op b.val_; \ + } + IVEC_BINFN_I(mullo, *) + IVEC_BINFN_I(srlv, >>) + VEC_INLINE static ivec the_and(const ivec &a, const ivec &b) { + return a.val_ & b.val_; + } + + VEC_INLINE static ivec mask_expand( + const ivec &src, const bvec &a, const ivec &b + ) { + return a.val_ ? b.val_ : src.val_; + } + VEC_INLINE static ivec masku_compress( + const bvec &a, const ivec &b + ) { + return a.val_ ? b.val_ : 0; + } + + VEC_INLINE static int at(const ivec &a, int b) { + assert(b == 0); + return a.val_; + } + + VEC_INLINE static ivec load(const int * src) { + return *src; + } + VEC_INLINE static ivec mask_loadu(const bvec &mask, const int * src) { + return mask.val_ ? *src : 0xDEAD; + } + VEC_INLINE static ivec maskz_loadu(const bvec &mask, const int * src) { + return mask.val_ ? *src : 0; + } + VEC_INLINE static void mask_storeu(const bvec &mask, int * dest, + const ivec &src) { + if (mask.val_) *dest = src.val_; + } + VEC_INLINE static void store(int * dest, const ivec &src) { + *dest = src.val_; + } + + VEC_INLINE static ivec mask_gather( + const ivec &src, const bvec &mask, const ivec &idx, const int * mem, + const int scale + ) { + return mask.val_ ?
*reinterpret_cast<const int *> + (reinterpret_cast<const char *>(mem) + scale * idx.val_) : src.val_; + } + VEC_INLINE static void mask_i32scatter( + int * mem, const bvec &mask, const ivec &idx, const ivec &a, + const int scale + ) { + if (mask.val_) *reinterpret_cast<int *>(reinterpret_cast<char *>(mem) + + scale * idx.val_) = a.val_; + } + + VEC_INLINE static void mask_compressstore(const bvec &mask, int * dest, + const ivec &src) { + if (mask.val_) *dest = src.val_; + } + + VEC_INLINE static ivec set( + int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8, + int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0 + ) { + return i0; + } + VEC_INLINE static ivec set1(int i) { + return i; + } + VEC_INLINE static ivec setzero() { + return 0; + } + VEC_INLINE static ivec undefined() { + return 0xDEAD; + } + + VEC_INLINE ivec operator +(const ivec &b) const { + return val_ + b.val_; + } +}; + +class fvec { + friend class avec; + flt_t val_; + VEC_INLINE fvec(const flt_t &v) : val_(v) {} +public: + static const int VL = 1; + VEC_INLINE fvec() {} + VEC_INLINE static flt_t at(const fvec &a, int i) { + assert(i < 1); + return a.val_; + } + VEC_INLINE static bool fast_compress() { return false; } + + #define FVEC_MASK_BINFN_B(the_name, the_op) \ + VEC_INLINE static bvec the_name(const fvec &a, const fvec &b) { \ + return a.val_ the_op b.val_; \ + } \ + VEC_INLINE static bvec mask_##the_name( \ + const bvec &mask, \ + const fvec &a, const fvec &b \ + ) { \ + return mask.val_ && (a.val_ the_op b.val_); \ + } + FVEC_MASK_BINFN_B(cmple, <=) + FVEC_MASK_BINFN_B(cmplt, <) + FVEC_MASK_BINFN_B(cmpneq, !=) + FVEC_MASK_BINFN_B(cmpnle, >) + FVEC_MASK_BINFN_B(cmpnlt, >=) + + #define FVEC_UNFN_F(the_name, the_fn) \ + VEC_INLINE static fvec the_name(const fvec &a) { \ + return the_fn(a.val_); \ + } + FVEC_UNFN_F(abs, fabs) + FVEC_UNFN_F(exp, ::exp) + FVEC_UNFN_F(invsqrt, 1/std::sqrt) + FVEC_UNFN_F(recip, 1/) + FVEC_UNFN_F(sqrt, std::sqrt) + + #define FVEC_MASK_UNFN_F(the_name, the_fn) \ + VEC_INLINE static fvec mask_##the_name( \ + const fvec &src, const bvec &mask, \ + const fvec &a \ + ) { \ + return mask.val_ ? the_fn(a.val_) : src.val_; \ + } + FVEC_MASK_UNFN_F(cos, std::cos) + FVEC_MASK_UNFN_F(recip, 1/) + FVEC_MASK_UNFN_F(sqrt, std::sqrt) + + #define FVEC_BINFN_F(the_name, the_fn) \ + VEC_INLINE static fvec the_name(const fvec &a, const fvec &b) { \ + return the_fn(a.val_, b.val_); \ + } + FVEC_BINFN_F(max, ::fmax) + FVEC_BINFN_F(min, ::fmin) + + #define FVEC_MASK_BINFN_F(the_name, the_op) \ + VEC_INLINE static fvec mask_##the_name( \ + const fvec &src, const bvec &mask, \ + const fvec &a, const fvec &b \ + ) { \ + return mask.val_ ? a.val_ the_op b.val_ : src.val_; \ + } + FVEC_MASK_BINFN_F(add, +) + FVEC_MASK_BINFN_F(div, /) + FVEC_MASK_BINFN_F(mul, *) + FVEC_MASK_BINFN_F(sub, -) + VEC_INLINE static fvec mask_blend( + const bvec &mask, const fvec &a, const fvec &b + ) { + return mask.val_ ? b.val_ : a.val_; + } + + VEC_INLINE static fvec mask_expand( + const fvec &src, const bvec &a, const fvec &b + ) { + return a.val_ ? b.val_ : src.val_; + } + VEC_INLINE static fvec masku_compress( + const bvec &a, const fvec &b + ) { + return a.val_ ?
b.val_ : 0; + } + + VEC_INLINE static fvec set1(const flt_t &a) { + return a; + } + VEC_INLINE static fvec setzero() { + return 0; + } + VEC_INLINE static fvec undefined() { + return 1337.1337; + } + + VEC_INLINE static fvec load(const flt_t *mem) { + return *mem; + } + VEC_INLINE static void mask_storeu(const bvec &mask, flt_t * dest, + const fvec &a) { + if (mask.val_) *dest = a.val_; + } + VEC_INLINE static void store(flt_t * dest, const fvec &a) { + *dest = a.val_; + } + + VEC_INLINE static fvec gather(const ivec &idx, const flt_t * mem, + const int scale) { + return *reinterpret_cast<const flt_t *>(reinterpret_cast<const char *>(mem) + + scale * idx.val_); + } + VEC_INLINE static fvec mask_gather( + const fvec &src, const bvec &mask, const ivec &idx, + const flt_t * mem, const int scale + ) { + return mask.val_ ? *reinterpret_cast<const flt_t *> + (reinterpret_cast<const char *>(mem) + scale * idx.val_) : src.val_; + } + + VEC_INLINE static void gather_3_adjacent(const ivec &idx, const flt_t * mem, + const int scale, fvec * out_0, + fvec * out_1, fvec * out_2) { + assert(scale == sizeof(flt_t)); + *out_0 = gather(idx, mem + 0, scale); + *out_1 = gather(idx, mem + 1, scale); + *out_2 = gather(idx, mem + 2, scale); + } + VEC_INLINE static void gather_4_adjacent(const ivec &idx, const flt_t * mem, + const int scale, fvec * out_0, + fvec * out_1, fvec * out_2, + fvec * out_3) { + assert(scale == sizeof(flt_t)); + *out_0 = gather(idx, mem + 0, scale); + *out_1 = gather(idx, mem + 1, scale); + *out_2 = gather(idx, mem + 2, scale); + *out_3 = gather(idx, mem + 3, scale); + } + + VEC_INLINE static flt_t mask_reduce_add(const bvec &mask, const fvec &a) { + return mask.val_ ? a.val_ : 0; + } + VEC_INLINE static flt_t reduce_add(const fvec &a) { + return a.val_; + } + + VEC_INLINE static ivec unpackloepi32(const fvec &a) { + return reinterpret_cast<const int *>(&a.val_)[0]; + } + + VEC_INLINE static fvec mask_sincos( + fvec * cos_out, const fvec &src_a, const fvec &src_b, + const bvec &mask, const fvec &arg + ) { + cos_out->val_ = mask.val_ ? ::cos(arg.val_) : src_b.val_; + return mask.val_ ? ::sin(arg.val_) : src_a.val_; + } + + #define FVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline fvec operator the_sym(const fvec &b) const { \ + return this->val_ the_sym b.val_; \ + } + FVEC_BINOP(+, add) + FVEC_BINOP(-, sub) + FVEC_BINOP(*, mul) + FVEC_BINOP(/, div) + + VEC_INLINE static void gather_prefetch0(const ivec &idx, const void * mem) {} +}; + +class avec { + acc_t val_; + VEC_INLINE avec(const acc_t &a) : val_(a) {} +public: + VEC_INLINE avec(const fvec &a) : val_(a.val_) {} + VEC_INLINE static avec undefined() { + return 1337.1337; + } + VEC_INLINE static avec mask_gather(const avec &src, const bvec &mask, + const ivec &idx, const acc_t * mem, + const int scale) { + return mask.val_ ?
*reinterpret_cast<const acc_t *> + (reinterpret_cast<const char *>(mem) + scale * idx.val_) : src.val_; + } + VEC_INLINE static void mask_i32loscatter(acc_t * mem, const bvec &mask, + const ivec &idx, const avec &a, + const int scale) { + if (mask.val_) *reinterpret_cast<acc_t *>(reinterpret_cast<char *>(mem) + + idx.val_ * scale) = a.val_; + } + + #define AVEC_BINOP(the_sym, the_name) \ + VEC_INLINE inline avec operator the_sym(const avec &b) const { \ + return this->val_ the_sym b.val_; \ + } + AVEC_BINOP(-, sub) +}; + +}; + +#endif diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp index c5574a78c7..3a36ead499 100644 --- a/src/USER-INTEL/nbin_intel.cpp +++ b/src/USER-INTEL/nbin_intel.cpp @@ -211,6 +211,8 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { for (i = nall-1; i >= nlocal; i--) { if (mask[i] & bitmask) { ibin = coord2bin(atom->x[i]); + // Only necessary to store when neighboring ghost + atombin[i] = ibin; bins[i] = binhead[ibin]; binhead[ibin] = i; } @@ -222,14 +224,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { binhead[ibin] = i; } } else { - for (i = nall-1; i >= nlocal; i--) { + for (i = nall-1; i >= 0; i--) { ibin = coord2bin(atom->x[i]); - bins[i] = binhead[ibin]; - binhead[ibin] = i; - } - for (i = nlocal-1; i >= 0; i--) { - ibin = coord2bin(atom->x[i]); - atombin[i]=ibin; + // Only necessary to store for ghost when neighboring ghost + atombin[i] = ibin; bins[i] = binhead[ibin]; binhead[ibin] = i; } diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp new file mode 100644 index 0000000000..12101712f1 --- /dev/null +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -0,0 +1,593 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: W.
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "npair_full_bin_ghost_intel.h" +#include "neighbor.h" +#include "nstencil.h" +#include "neigh_list.h" +#include "atom.h" +#include "atom_vec.h" +#include "molecule.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {} + +/* ---------------------------------------------------------------------- + binned neighbor list construction for all neighbors + include neighbors of ghost atoms, but no "special neighbors" for ghosts + every neighbor pair appears in list of both atoms i and j +------------------------------------------------------------------------- */ + +void NPairFullBinGhostIntel::build(NeighList *list) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_fix->offload_noghost()) + error->all(FLERR, + "The 'ghost no' option cannot be used with this USER-INTEL pair style."); + #endif + + if (nstencil > INTEL_MAX_STENCIL_CHECK) + error->all(FLERR, "Too many neighbor bins for USER-INTEL package."); + + #ifdef _LMP_INTEL_OFFLOAD + if (exclude) + error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); + #endif + + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) + fbi<float,double>(list, _fix->get_mixed_buffers()); + else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) + fbi<double,double>(list, _fix->get_double_buffers()); + else + fbi<float,float>(list, _fix->get_single_buffers()); + + _fix->stop_watch(TIME_HOST_NEIGHBOR); +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void NPairFullBinGhostIntel::fbi(NeighList * list, + IntelBuffers<flt_t,acc_t> * buffers) +{ + const int nlocal = atom->nlocal; + const int nall = atom->nlocal + atom->nghost; + list->inum = atom->nlocal; + list->gnum = atom->nghost; + + int host_start = _fix->host_start_neighbor(); + const int off_end = _fix->offload_end_neighbor(); + + #ifdef _LMP_INTEL_OFFLOAD + if (off_end) grow_stencil(); + if (_fix->full_host_list()) host_start = 0; + int offload_noghost = _fix->offload_noghost(); + #endif + + // only uses offload_end_neighbor to check whether we are doing offloading + // at all, no need to correct this later + buffers->grow_list(list, nall, comm->nthreads, off_end, + _fix->nbor_pack_width()); + + int need_ic = 0; + if (atom->molecular) + dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, + neighbor->cutneighmax); + + if (need_ic) { + fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end); + fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal); + } else { + fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end); + fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal); + } +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t, int need_ic> +void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, + IntelBuffers<flt_t,acc_t> * buffers, + const int pstart, const int pend) { + if (pend-pstart == 0) return; + + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + int nall_t = nall; + const int aend = nall; + + const int pack_width = _fix->nbor_pack_width(); + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); + const int e_nall = nall_t; + + const int molecular = atom->molecular; + int *ns = NULL; + tagint *s = NULL; + int tag_size = 0, special_size; + if (buffers->need_tag()) tag_size = e_nall; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + special_size = aend;
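+ // For molecular systems the raw special-bond arrays are carried along so
+ // the build loop can later fold the 1-2/1-3/1-4 exclusion flags into the
+ // stored neighbor indices (the j ^ (which << SBBITS) encoding below).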
} else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + special_size = 0; + } + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; + const int maxspecial = atom->maxspecial; + const tagint * _noalias const tag = atom->tag; + + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int nstencil = this->nstencil; + const int * _noalias const stencil = this->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; + const flt_t * _noalias const cutneighghostsq = + buffers->get_cutneighghostsq()[0]; + const int ntypes = atom->ntypes + 1; + const int nlocal = atom->nlocal; + + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + tagint * const molecule = atom->molecule; + #endif + + int *molindex = atom->molindex; + int *molatom = atom->molatom; + Molecule **onemols = atom->avec->onemols; + int moltemplate; + if (molecular == 2) moltemplate = 1; + else moltemplate = 0; + if (moltemplate) + error->all(FLERR, + "Can't use moltemplate with npair style full/bin/ghost/intel."); + + int tnum; + int *overflow; + double *timer_compute; + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + timer_compute = _fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + overflow = _fix->get_off_overflow_flag(); + _fix->stop_watch(TIME_HOST_NEIGHBOR); + _fix->start_watch(TIME_OFFLOAD_LATENCY); + } else + #endif + { + tnum = comm->nthreads; + overflow = _fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); + const int * _noalias const binpacked = buffers->get_binpacked(); + + const int xperiodic = domain->xperiodic; + const int yperiodic = domain->yperiodic; + const int zperiodic = domain->zperiodic; + const flt_t xprd_half = domain->xprd_half; + const flt_t yprd_half = domain->yprd_half; + const flt_t zprd_half = domain->zprd_half; + + flt_t * _noalias const ncachex = buffers->get_ncachex(); + flt_t * _noalias const ncachey = buffers->get_ncachey(); + flt_t * _noalias const ncachez = buffers->get_ncachez(); + int * _noalias const ncachej = buffers->get_ncachej(); + int * _noalias const ncachejtype = buffers->get_ncachejtype(); + int * _noalias const ncachetag = buffers->get_ncachetag(); + const int ncache_stride = buffers->ncache_stride(); + + const int mbinx = this->mbinx; + const int mbiny = this->mbiny; + const int mbinz = this->mbinz; + const int * const stencilxyz = &this->stencilxyz[0][0]; + + #ifdef _LMP_INTEL_OFFLOAD + const int * _noalias const binhead = this->binhead; + const int * _noalias const bins = this->bins; + const int cop = _fix->coprocessor_number(); + const int separate_buffers = _fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ + in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) 
free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ + in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \ + in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ + in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \ + in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ + in(stencilxyz:length(3*nstencil)) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(tag) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = 0; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = e_nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + int nstencilp = 0; + int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; + for (int k = 0; k < nstencil; k++) { + binstart[nstencilp] = stencil[k]; + int end = stencil[k] + 1; + for (int kk = k + 1; kk < nstencil; kk++) { + if (stencil[kk-1]+1 == stencil[kk]) { + end++; + k++; + } else break; + } + binend[nstencilp] = end; + nstencilp++; + } + + const int mbinyx = mbiny * mbinx; + + #if defined(_OPENMP) + #pragma omp parallel + #endif + { + const int num = aend; + int tid, ifrom, ito; + + const double balance_factor = 2.0; + const double ibalance_factor = 1.0 / balance_factor; + const int gnum = num - nlocal; + const int wlocal = static_cast<int>(ceil(balance_factor * nlocal)); + const int snum = wlocal + gnum; + IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads); + if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom); + else ifrom -= wlocal - nlocal; + if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito); + else ito -= wlocal - nlocal; + + int e_ito = ito; + const int list_size = (e_ito + tid * 2 + 2) * maxnbors; + + int which; + + int pack_offset = maxnbors; + int ct = (ifrom + tid * 2) * maxnbors; + int *neighptr = firstneigh + ct; + const int obound = pack_offset + maxnbors * 2; + + const int toffs = tid * ncache_stride; + flt_t * _noalias const tx = ncachex + toffs; + flt_t * _noalias const ty = ncachey + toffs; + flt_t * _noalias const tz = ncachez + toffs; + int * _noalias const tj = ncachej + toffs; + int * _noalias const tjtype = ncachejtype + toffs; + int * _noalias const ttag = ncachetag + toffs; + + // loop over all atoms in other bins in stencil, store every pair + int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; + for (int i = ifrom; i < ito; i++) { + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const int itype = x[i].w; + const tagint itag = tag[i]; + const int ioffset = ntypes * itype; + + const int ibin = atombin[i]; + if (ibin != oldbin) { + oldbin = ibin; + ncount = 0; + if (i < nlocal) { + for (int k = 0; k < nstencilp; k++) { + const int bstart = binhead[ibin + binstart[k]]; + const int bend = binhead[ibin + binend[k]]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) + tj[ncount++] = binpacked[jj]; + } + } else { + const int zbin = ibin / mbinyx; + const int zrem = ibin % mbinyx; + const int ybin = zrem / mbinx; + const int xbin = zrem % mbinx; + for (int k = 0; k < nstencil; k++) { + const int xbin2 = xbin + stencilxyz[3 * k + 0]; + const int ybin2 = ybin + stencilxyz[3 * k + 1]; + const int zbin2 = zbin +
stencilxyz[3 * k + 2]; + if (xbin2 < 0 || xbin2 >= mbinx || + ybin2 < 0 || ybin2 >= mbiny || + zbin2 < 0 || zbin2 >= mbinz) continue; + + const int bstart = binhead[ibin + stencil[k]]; + const int bend = binhead[ibin + stencil[k] + 1]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) + tj[ncount++] = binpacked[jj]; + } + } // if i < nlocal + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int u = 0; u < ncount; u++) { + const int j = tj[u]; + tx[u] = x[j].x; + ty[u] = x[j].y; + tz[u] = x[j].z; + tjtype[u] = x[j].w; + ttag[u] = tag[j]; + } + } // if ibin != oldbin + + // ---------------------- Loop over other bins + + int n = maxnbors; + int n2 = n * 2; + int *neighptr2 = neighptr; + const flt_t * _noalias cutsq; + if (i < nlocal) cutsq = cutneighsq; + else cutsq = cutneighghostsq; + + const int icp = i; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = 0; u < ncount; u++) { + int addme = 1; + int j = tj[u]; + + if (i == j) addme = 0; + + // Cutoff Check + const flt_t delx = xtmp - tx[u]; + const flt_t dely = ytmp - ty[u]; + const flt_t delz = ztmp - tz[u]; + const int jtype = tjtype[u]; + const int jtag = ttag[u]; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutsq[ioffset + jtype]) addme = 0; + + if (need_ic && icp < nlocal) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + int flist = 0; + if (itag > jtag) { + if (((itag+jtag) & 1) == 0) flist = 1; + } else if (itag < jtag) { + if (((itag+jtag) & 1) == 1) flist = 1; + } else { + if (tz[u] < ztmp) flist = 1; + else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; + else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) + flist = 1; + } + if (addme) { + if (flist) + neighptr2[n2++] = j; + else + neighptr[n++] = j; + } + } // for u + + #ifndef _LMP_INTEL_OFFLOAD + if (exclude) { + int alln = n; + n = maxnbors; + for (int u = pack_offset; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n++] = j; + } + alln = n2; + n2 = maxnbors * 2; + for (int u = n2; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n2++] = j; + } + } + #endif + int ns = n - maxnbors; + int alln = n; + atombin[i] = ns; + n = 0; + for (int u = maxnbors; u < alln; u++) + neighptr[n++] = neighptr[u]; + ns += n2 - maxnbors * 2; + for (int u = maxnbors * 2; u < n2; u++) + neighptr[n++] = neighptr[u]; + if (ns > maxnbors) *overflow = 1; + + ilist[i] = i; + cnumneigh[i] = ct; + numneigh[i] = ns; + + ct += ns; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = ct & (alignb - 1); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + #ifdef _LMP_INTEL_OFFLOAD + int ghost_offset = 0, nall_offset = e_nall; + if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + #if __INTEL_COMPILER+0 > 1499 + #pragma vector aligned + 
#pragma simd + #endif + for (int jj = 0; jj < jnum; jj++) { + int j = jlist[jj]; + if (need_ic && j < 0) j = -j - 1; + } + } + + overflow[LMP_LOCAL_MIN] = 0; + overflow[LMP_LOCAL_MAX] = nlocal - 1; + overflow[LMP_GHOST_MIN] = nlocal; + overflow[LMP_GHOST_MAX] = e_nall - 1; + + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } // if separate_buffers + #endif + + if (molecular) { + int ito_m = ito; + if (ito >= nlocal) ito_m = nlocal; + for (int i = ifrom; i < ito_m; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which) jlist[jj] = j ^ (which << SBBITS); + } + } // for i + } // if molecular + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + #pragma vector aligned + #pragma simd + for (jj = 0; jj < jnum; jj++) { + if (jlist[jj] >= nlocal) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + } + } + } + } + #endif + } // end omp + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _fix->stop_watch(TIME_OFFLOAD_LATENCY); + _fix->start_watch(TIME_HOST_NEIGHBOR); + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + } else { + for (int i = 0; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + if (separate_buffers) { + _fix->start_watch(TIME_PACK); + _fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(_fix->host_min_local(), + _fix->host_used_local(), + _fix->host_min_ghost(), + _fix->host_used_ghost()); + _fix->stop_watch(TIME_PACK); + } + } + #else + #pragma vector aligned + #pragma simd + for (int i = 0; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + #endif +} diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.h b/src/USER-INTEL/npair_full_bin_ghost_intel.h new file mode 100644 index 0000000000..4449dfa1e1 --- /dev/null +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.h @@ -0,0 +1,55 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef NPAIR_CLASS + +NPairStyle(full/bin/ghost/intel, + NPairFullBinGhostIntel, + NP_FULL | NP_BIN | NP_GHOST | NP_NEWTON | NP_NEWTOFF | + NP_ORTHO | NP_TRI | NP_INTEL) + +#else + +#ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H +#define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H + +#include "npair_intel.h" + +namespace LAMMPS_NS { + +class NPairFullBinGhostIntel : public NPairIntel { + public: + NPairFullBinGhostIntel(class LAMMPS *); + ~NPairFullBinGhostIntel() {} + void build(class NeighList *); + private: + template <class flt_t, class acc_t> + void fbi(NeighList * list, IntelBuffers<flt_t,acc_t> * buffers); + template <class flt_t, class acc_t, int need_ic> + void fbi(const int offload, NeighList * list, + IntelBuffers<flt_t,acc_t> * buffers, + const int astart, const int aend); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index b20b1dcd08..79dc75366e 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -143,6 +143,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, flt_t * _noalias const ncachez = buffers->get_ncachez(); int * _noalias const ncachej = buffers->get_ncachej(); int * _noalias const ncachejtype = buffers->get_ncachejtype(); + int * _noalias const ncachetag = buffers->get_ncachetag(); const int ncache_stride = buffers->ncache_stride(); #ifdef _LMP_INTEL_OFFLOAD @@ -165,7 +166,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, in(atombin:length(aend) alloc_if(0) free_if(0)) \ in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ - in(ncachejtype:length(0) alloc_if(0) free_if(0)) \ + in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \ in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \ in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ @@ -222,7 +223,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ito += astart; int e_ito = ito; if (THREE && ito == num) { - int imod = ito % pack_width; + int imod = ito & (pack_width - 1); if (imod) e_ito += pack_width - imod; } const int list_size = (e_ito + tid * 2 + 2) * maxnbors; @@ -241,6 +242,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, flt_t * _noalias const tz = ncachez + toffs; int * _noalias const tj = ncachej + toffs; int * _noalias const tjtype = ncachejtype + toffs; + int * _noalias const ttag = ncachetag + toffs; flt_t * _noalias itx; flt_t * _noalias ity; @@ -287,13 +289,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ty[u] = x[j].y; tz[u] = x[j].z; tjtype[u] = x[j].w; + if (THREE) ttag[u] = tag[j]; } if (FULL == 0 || TRI == 1) { icount = 0; istart = ncount; const int alignb = INTEL_DATA_ALIGN / sizeof(int); - int nedge = istart % alignb; + int nedge = istart & (alignb - 1); if (nedge) istart += (alignb - nedge); itx = tx + istart; ity = ty + istart; @@ -343,7 +346,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, // i bin (half) check and offload ghost check if (j < nlocal) { - const int ijmod = (i + j) % 2; + const int ijmod = (i + j) & 1; if (i > j) { if (ijmod == 0) addme = 0; } else if (i < j) { @@ -424,8 +427,6 @@
void NPairIntel::bin_newton(const int offload, NeighList *list, } #endif - int pj; - if (THREE) pj = j; if (need_ic) { int no_special; ominimum_image_check(no_special, delx, dely, delz); @@ -434,12 +435,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, } if (THREE) { - const int jtag = tag[pj]; + const int jtag = ttag[u]; int flist = 0; if (itag > jtag) { - if ((itag+jtag) % 2 == 0) flist = 1; + if (((itag+jtag) & 1) == 0) flist = 1; } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) flist = 1; + if (((itag+jtag) & 1) == 1) flist = 1; } else { if (tz[u] < ztmp) flist = 1; else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; @@ -512,7 +513,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, cnumneigh[i] += lane; numneigh[i] = ns; } else { - int edge = (n % pad_width); + int edge = n & (pad_width - 1); if (edge) { const int pad_end = n + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) @@ -532,7 +533,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (lane == pack_width) { ct += max_chunk * pack_width; const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); + const int edge = ct & (alignb - 1); if (edge) ct += alignb - edge; neighptr = firstneigh + ct; max_chunk = 0; @@ -548,7 +549,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, } else { ct += n; const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); + const int edge = ct & (alignb - 1); if (edge) ct += alignb - edge; neighptr = firstneigh + ct; if (ct + obound > list_size) { diff --git a/src/USER-INTEL/pair_airebo_intel.cpp b/src/USER-INTEL/pair_airebo_intel.cpp new file mode 100644 index 0000000000..ad3c97c9df --- /dev/null +++ b/src/USER-INTEL/pair_airebo_intel.cpp @@ -0,0 +1,4891 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(push, target(mic)) +#endif +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <cstdint> +#include <algorithm> +#include "lmptype.h" +#include "intel_preprocess.h" +#include "intel_intrinsics_airebo.h" +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +#include <cmath> +#include <cstring> +#include "pair_airebo_intel.h" +#include "atom.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "force.h" +#include "comm.h" +#include "memory.h" +#include "error.h" +#include "group.h" +#include "kspace.h" +#include "modify.h" +#include "suffix.h" + +using namespace LAMMPS_NS; + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(push, target(mic)) +#endif + +template <class flt_t, class acc_t> +struct LAMMPS_NS::PairAIREBOIntelParam { + flt_t cutlj, cutljrebosq, cut3rebo; + flt_t sigmin, sigcut; + flt_t cutljsq[2][2]; + flt_t lj1[2][2], lj2[2][2], lj3[2][2], lj4[2][2]; + + flt_t smin, Nmin, Nmax, NCmin, NCmax, thmin, thmax; + flt_t rcmin[2][2], rcmax[2][2], rcmaxsq[2][2], rcmaxp[2][2]; + flt_t Q[2][2], alpha[2][2], A[2][2], rho[2][2], BIJc[2][2][3], + Beta[2][2][3]; + flt_t rcLJmin[2][2], rcLJmax[2][2], rcLJmaxsq[2][2], bLJmin[2][2], + bLJmax[2][2]; + flt_t epsilon[2][2], sigma[2][2], epsilonT[2][2]; + + // spline coefficients + + flt_t gCdom[5], gC1[4][6], gC2[4][6], gHdom[4], gH[3][6]; + flt_t gDom[5+4]; + flt_t gVal[(4+4+3)*6]; + flt_t pCCdom[2][2], pCHdom[2][2], pCC[4][4][16], pCH[4][4][16]; + flt_t piCCdom[3][2], piCHdom[3][2], piHHdom[3][2]; + acc_t piCC[4][4][9][64], piCH[4][4][9][64], piHH[4][4][9][64]; + flt_t Tijdom[3][2]; + acc_t Tijc[4][4][9][64]; + + // spline knot values + + flt_t PCCf[5][5], PCCdfdx[5][5], PCCdfdy[5][5], PCHf[5][5]; + flt_t PCHdfdx[5][5], PCHdfdy[5][5]; + flt_t piCCf[5][5][11], piCCdfdx[5][5][11]; + flt_t piCCdfdy[5][5][11], piCCdfdz[5][5][11]; + flt_t piCHf[5][5][11], piCHdfdx[5][5][11]; + flt_t piCHdfdy[5][5][11], piCHdfdz[5][5][11]; + flt_t piHHf[5][5][11], piHHdfdx[5][5][11]; + flt_t piHHdfdy[5][5][11], piHHdfdz[5][5][11]; + flt_t Tf[5][5][10], Tdfdx[5][5][10], Tdfdy[5][5][10], Tdfdz[5][5][10]; +}; + +namespace { + +struct NeighListAIREBO { + int * num; /* num_all */ + int * num_half; /* num_all */ + int * offset; /* num_all */ + int * entries; /* num_all * num_neighs_per_atom */ +}; + +template <class flt_t> +struct AtomAIREBOT { + flt_t x, y, z; + int w; +}; + +template <class acc_t> +struct ResultForceT { + acc_t x, y, z, w; +}; + +template <class flt_t, class acc_t> +struct KernelArgsAIREBOT { + int num_local; + int num_all; + int num_neighs_per_atom; + int num_types; + int frebo_from_atom, frebo_to_atom; + int neigh_from_atom, neigh_to_atom; + int rebuild_flag; + flt_t skin; + struct NeighListAIREBO neigh_lmp; + struct NeighListAIREBO neigh_rebo; + PairAIREBOIntelParam<flt_t,acc_t> params; + struct AtomAIREBOT<flt_t> * x; /* num_all */ + int * tag; /* num_all */ + flt_t * nC, * nH; /* num_all */ + int * map; /* num_types+1 */ + struct ResultForceT<acc_t> * result_f; /* num_all */ + acc_t result_eng; +}; + +template<class flt_t, class acc_t> +void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag); +template<class flt_t, class acc_t> +void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka); +template<class flt_t, class acc_t> +void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag); + +} + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +/*
---------------------------------------------------------------------- */ + +PairAIREBOIntel::PairAIREBOIntel(LAMMPS *lmp) : PairAIREBO(lmp) +{ + suffix_flag |= Suffix::INTEL; + REBO_cnumneigh = NULL; + REBO_num_skin = NULL; + REBO_list_data = NULL; + fix = NULL; +} + +/* ---------------------------------------------------------------------- */ + +PairAIREBOIntel::~PairAIREBOIntel() +{ + memory->destroy(REBO_cnumneigh); + memory->destroy(REBO_num_skin); + memory->destroy(REBO_list_data); +} + +/* ---------------------------------------------------------------------- */ + +void PairAIREBOIntel::init_style() +{ + PairAIREBO::init_style(); + neighbor->requests[neighbor->nrequest-1]->intel = 1; + + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast<FixIntel *>(modify->fix[ifix]); + + fix->pair_init_check(); + #ifdef _LMP_INTEL_OFFLOAD + _cop = fix->coprocessor_number(); + #endif + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + pack_force_const(fix->get_mixed_buffers()); + fix->get_mixed_buffers()->need_tag(1); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + pack_force_const(fix->get_double_buffers()); + fix->get_double_buffers()->need_tag(1); + } else { + pack_force_const(fix->get_single_buffers()); + fix->get_single_buffers()->need_tag(1); + } + + #ifdef _LMP_INTEL_OFFLOAD + if (fix->offload_noghost()) + error->all(FLERR,"The 'ghost no' option cannot be used with airebo/intel."); + #endif +} + +/* ---------------------------------------------------------------------- */ + +template<typename T> +T * calloc_it(size_t size) { + return static_cast<T*>(calloc(size, sizeof(T))); +} + +void PairAIREBOIntel::compute(int eflag, int vflag) +{ + if (fix->precision()==FixIntel::PREC_MODE_MIXED) + compute<float,double>(eflag, vflag, fix->get_mixed_buffers()); + else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) + compute<double,double>(eflag, vflag, fix->get_double_buffers()); + else + compute<float,float>(eflag, vflag, fix->get_single_buffers()); + + fix->balance_stamp(); + vflag_fdotr = 0; +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +PairAIREBOIntelParam<flt_t,acc_t> PairAIREBOIntel::get_param() +{ + PairAIREBOIntelParam<flt_t,acc_t> fc; + +#define A(a) \ + for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \ + reinterpret_cast<flt_t*>(&fc.a)[i] = \ + reinterpret_cast<double*>(&this->a)[i]; \ + }
#define A0(a) \ + for (int i = 0; i < sizeof(fc.a)/sizeof(flt_t); i++) { \ + reinterpret_cast<flt_t*>(&fc.a)[i] = \ + reinterpret_cast<double*>(this->a[0])[i]; \ + }
#define B(a) \ + for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) { \ + reinterpret_cast<acc_t*>(&fc.a)[i] = \ + reinterpret_cast<double*>(&this->a)[i]; \ + } + + A(cutlj) A(cutljrebosq) A(cut3rebo) A(sigmin); + A(sigcut) A0(cutljsq) A0(lj1) A0(lj2) A0(lj3); + A0(lj4) A(smin) A(Nmin) A(Nmax) A(NCmin) A(NCmax) A(thmin) A(thmax); + A(rcmin) A(rcmax) A(rcmaxsq) A(rcmaxp) A(Q) A(alpha) A(A) A(rho) A(BIJc); + A(Beta) A(rcLJmin) A(rcLJmax) A(rcLJmaxsq) A(bLJmin) A(bLJmax) A(epsilon); + A(sigma) A(epsilonT) A(gCdom) A(gC1) A(gC2) A(gHdom) A(gH) A(pCCdom); + A(pCHdom) A(pCC) A(pCH) A(piCCdom) A(piCHdom) A(piHHdom) B(piCC); + B(piCH) B(piHH) A(Tijdom) B(Tijc) A(PCCf) A(PCCdfdx) A(PCCdfdy) A(PCHf); + A(PCHdfdx) A(PCHdfdy) A(piCCf) A(piCCdfdx) A(piCCdfdy) A(piCCdfdz); + A(piCHf) A(piCHdfdx) A(piCHdfdy) A(piCHdfdz) A(piHHf) A(piHHdfdx); + A(piHHdfdy) A(piHHdfdz) A(Tf) A(Tdfdx) A(Tdfdy) A(Tdfdz); + +#undef A +#undef A0 +#undef B + for (int i = 0; i < 5; i++) fc.gDom[i] = fc.gCdom[i]; + for (int
i = 0; i < 4; i++) fc.gDom[5+i] = fc.gHdom[i]; + for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++) + fc.gVal[6*i+j] = fc.gC1[i][j]; + for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++) + fc.gVal[4*6+6*i+j] = fc.gC2[i][j]; + for (int i = 0; i < 3; i++) for (int j = 0; j < 6; j++) + fc.gVal[8*6+6*i+j] = fc.gH[i][j]; + + return fc; +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void PairAIREBOIntel::compute( + int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers +) { + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = vflag_atom = 0; + pvector[0] = pvector[1] = pvector[2] = 0.0; + + const int inum = list->inum; + const int nthreads = comm->nthreads; + const int host_start = fix->host_start_pair(); + const int offload_end = fix->offload_end_pair(); + const int ago = neighbor->ago; + + if (ago != 0 && fix->separate_buffers() == 0) { + fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); + buffers->thr_pack(ifrom,ito,ago); + } + fix->stop_watch(TIME_PACK); + } + + if (atom->nmax > maxlocal) { + #ifdef _LMP_INTEL_OFFLOAD + if (maxlocal > 0 && _cop >= 0) { + int * const REBO_numneigh = this->REBO_numneigh; + int * const REBO_num_skin = this->REBO_num_skin; + int * const REBO_cnumneigh = this->REBO_cnumneigh; + int * const REBO_list_data = this->REBO_list_data; + double * const nC = this->nC; + double * const nH = this->nH; + #pragma offload_transfer target(mic:_cop) \ + nocopy(REBO_numneigh: alloc_if(0) free_if(1)) \ + nocopy(REBO_cnumneigh: alloc_if(0) free_if(1)) \ + nocopy(REBO_num_skin: alloc_if(0) free_if(1)) \ + nocopy(REBO_list_data: alloc_if(0) free_if(1)) \ + nocopy(nH: alloc_if(0) free_if(1)) \ + nocopy(nC: alloc_if(0) free_if(1)) + } + #endif + maxlocal = atom->nmax; + memory->destroy(REBO_numneigh); + memory->destroy(REBO_cnumneigh); + memory->destroy(REBO_list_data); + memory->sfree(REBO_firstneigh); + memory->destroy(nC); + memory->destroy(nH); + memory->create(REBO_numneigh,maxlocal,"AIREBO:numneigh"); + memory->create(REBO_cnumneigh,maxlocal,"AIREBO:cnumneigh"); + memory->create(REBO_num_skin,maxlocal,"AIREBO:cnumneigh"); + int max_nbors = buffers->get_max_nbors(); + memory->create(REBO_list_data,maxlocal * max_nbors,"AIREBO:list_data"); + REBO_firstneigh = (int **) memory->smalloc(maxlocal*sizeof(int *), + "AIREBO:firstneigh"); + memory->create(nC,maxlocal,"AIREBO:nC"); + memory->create(nH,maxlocal,"AIREBO:nH"); + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) { + int * const REBO_numneigh = this->REBO_numneigh; + int * const REBO_num_skin = this->REBO_num_skin; + int * const REBO_cnumneigh = this->REBO_cnumneigh; + int * const REBO_list_data = this->REBO_list_data; + double * const nC = this->nC; + double * const nH = this->nH; + const int mnml = max_nbors * maxlocal; + #pragma offload_transfer target(mic:_cop) \ + nocopy(REBO_numneigh: length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(REBO_cnumneigh:length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(REBO_num_skin: length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(REBO_list_data:length(mnml) alloc_if(1) free_if(0)) \ + nocopy(nH: length(maxlocal) alloc_if(1) free_if(0)) \ + nocopy(nC: length(maxlocal) alloc_if(1) free_if(0)) + } + #endif + } + + if (evflag || vflag_fdotr) {
int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + eval<1,1>(1, ovflag, buffers, 0, offload_end); + eval<1,1>(0, ovflag, buffers, host_start, inum); + } else { + eval<1,0>(1, ovflag, buffers, 0, offload_end); + eval<1,0>(0, ovflag, buffers, host_start, inum); + } + } else { + eval<0,0>(1, 0, buffers, 0, offload_end); + eval<0,0>(0, 0, buffers, host_start, inum); + } +} + +/* ---------------------------------------------------------------------- */ + +template <int EVFLAG, int EFLAG, class flt_t, class acc_t> +void PairAIREBOIntel::eval( + const int offload, const int vflag, + IntelBuffers<flt_t,acc_t> * buffers, + const int astart, const int aend +) { + const int inum = aend - astart; + if (inum == 0) { + return; + } + int nlocal, nall, minlocal; + fix->get_buffern(offload, nlocal, nall, minlocal); + + const int ago = neighbor->ago; + IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); + + ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const numneighhalf = buffers->get_atombin(); + const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + int * const tag = atom->tag; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, 1 /*NEWTON_PAIR*/, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + + const int nthreads = tc; + const double skin = neighbor->skin; + const int max_nbor = buffers->get_max_nbors(); + const PairAIREBOIntelParam<flt_t,acc_t> param = get_param<flt_t,acc_t>(); + + // offload here + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + + int * const REBO_numneigh = this->REBO_numneigh; + int * const REBO_num_skin = this->REBO_num_skin; + int * const REBO_cnumneigh = this->REBO_cnumneigh; + int * const REBO_list_data = this->REBO_list_data; + double * const nC = this->nC; + double * const nH = this->nH; + const int torflag = this->torflag; + const int ljflag = this->ljflag; + const int morseflag = this->morseflag; + int * const map = this->map; + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + + #pragma offload target(mic:_cop) if(offload) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \ + in(f_stride,nlocal,minlocal,separate_flag,offload) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + in(param,skin,max_nbor) \ + in(tag: length(0) alloc_if(0) free_if(0)) \ + in(torflag, ljflag, morseflag, ago) \ + in(nC: length(0) alloc_if(0) free_if(0)) \ + in(nH: length(0) alloc_if(0) free_if(0)) \ + in(REBO_numneigh: length(0) alloc_if(0) free_if(0)) \ + in(REBO_cnumneigh: length(0) alloc_if(0) free_if(0)) \ + in(REBO_num_skin: length(0) alloc_if(0) free_if(0)) \ + in(REBO_list_data: length(0) alloc_if(0) free_if(0)) \ + in(map: length(0) alloc_if(0)
free_if(0)) \ + signal(f_start) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(1 /*NEWTON_PAIR*/, separate_flag, nlocal, nall, + f_stride, x, 0/*q*/); + + acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; + if (EVFLAG) { + oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + } + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel \ + shared(f_start,f_stride,nlocal,nall,minlocal) \ + reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + int neigh_iifrom, neigh_iito; + IP_PRE_omp_range(neigh_iifrom, neigh_iito, tid, nall, nthreads); + + FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + KernelArgsAIREBOT args; + args.num_local = nlocal; + args.num_all = nall; + args.num_neighs_per_atom = max_nbor; + args.num_types = ntypes; + args.frebo_from_atom = 0; + args.frebo_to_atom = args.num_local; + args.neigh_from_atom = 0; + args.neigh_to_atom = args.num_all; + args.rebuild_flag = ago == 0; + args.skin = skin; + args.neigh_lmp.num = const_cast(numneigh); + args.neigh_lmp.num_half = const_cast(numneighhalf); + args.neigh_lmp.offset = const_cast(cnumneigh); + args.neigh_lmp.entries = const_cast(firstneigh); + args.neigh_rebo.num = REBO_numneigh; + args.neigh_rebo.num_half = REBO_num_skin; + args.neigh_rebo.offset = REBO_cnumneigh; + args.neigh_rebo.entries = REBO_list_data; + args.params = param; + args.tag = tag; + args.nC = reinterpret_cast(nC); + args.nH = reinterpret_cast(nH); + args.map = map; + args.result_eng = 0; + args.x = (AtomAIREBOT*) x; + + args.result_f = (ResultForceT *) f; + args.neigh_from_atom = neigh_iifrom; + args.neigh_to_atom = neigh_iito; + args.frebo_from_atom = iifrom; + args.frebo_to_atom = iito; + + aut_rebo_neigh(&args); + #if defined(_OPENMP) + #pragma omp barrier + #endif + aut_frebo(&args, torflag); + if (ljflag) aut_lennard_jones(&args, morseflag); + + oevdwl += args.result_eng; + + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, x, + offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); + } // end of omp parallel region + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + if (EVFLAG) { + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; + } + } + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end of offload region + + if (offload) + fix->stop_watch(TIME_OFFLOAD_LATENCY); + else + fix->stop_watch(TIME_HOST_PAIR); + + if (EVFLAG) + fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); + else + fix->add_result_array(f_start, 0, offload); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairAIREBOIntel::pack_force_const(IntelBuffers * buffers) { + int tp1 = atom->ntypes + 1; + + buffers->set_ntypes(tp1,1); + flt_t **cutneighsq = buffers->get_cutneighsq(); + flt_t **cutneighghostsq = buffers->get_cutneighghostsq(); + + // Repeat cutsq calculation because done after call to init_style + double cut, cutneigh; + for (int i = 1; i <= atom->ntypes; i++) { + for 
(int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + cut = cutghost[i][j] + neighbor->skin; + cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut; + } + } + } + + #ifdef _LMP_INTEL_OFFLOAD + if (_cop < 0) return; + flt_t * ocutneighsq = cutneighsq[0]; + size_t VL = 512 / 8 / sizeof(flt_t); + int ntypes = tp1; + int tp1sq = tp1 * tp1; + // TODO the lifecycle of "map" is currently not 100% correct + // it might not be freed if this method is called more than once + int * map = this->map; + #pragma offload_transfer target(mic:_cop) \ + in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \ + in(map: length(tp1) alloc_if(1) free_if(0)) + #endif + +} + +/* ---------------------------------------------------------------------- + Implementation + ---------------------------------------------------------------------- */ + +namespace { + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(push, target(mic)) +#endif + +namespace overloaded { + double sqrt(double a) { return ::sqrt(a); } + float sqrt(float a) { return ::sqrtf(a); } + double sin(double a) { return ::sin(a); } + float sin(float a) { return ::sinf(a); } + double cos(double a) { return ::cos(a); } + float cos(float a) { return ::cosf(a); } + double exp(double a) { return ::exp(a); } + float exp(float a) { return ::expf(a); } + double pow(double a, double b) { return ::pow(a, b); } + float pow(float a, float b) { return ::powf(a, b); } +} + +/* ---------------------------------------------------------------------- + Scalar AIREBO implementation, standalone, with massive code reuse + compared to original code. + ---------------------------------------------------------------------- */ + +#define M_PI 3.14159265358979323846 /* pi */ + +#define CARBON 0 +#define HYDROGEN 1 +#define TOL 1.0e-9 + +template +inline T fmin_nonan(T a, T b) { + return a < b ? a : b; +} +template +inline T fmax_nonan(T a, T b) { + return a > b ? 
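+
+/* Reviewer note (commentary added in editing, not part of the original
+ * patch): unlike std::fmin/std::fmax, which are required to return the
+ * non-NaN operand, these plain ternaries carry no NaN special-casing and
+ * therefore typically compile down to a single vector min/max instruction
+ * each.  They are used throughout this file for clamping, e.g.
+ *
+ *   cosjik = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cosjik));
+ *
+ * which forces a cosine that drifted outside [-1,1] from roundoff back
+ * into the valid domain of the angular g spline.
+ */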
+
+template <typename flt_t>
+inline flt_t Sp(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  flt_t t = (r - lo) / (hi - lo);
+  if (t <= 0) {
+    if (del) *del = 0;
+    return 1;
+  } else if (t >= 1) {
+    if (del) *del = 0;
+    return 0;
+  } else {
+    t *= static_cast<flt_t>(M_PI);
+    if (del) *del = static_cast<flt_t>(-0.5 * M_PI)
+        * overloaded::sin(t) / (hi - lo);
+    return static_cast<flt_t>(0.5) * (1 + overloaded::cos(t));
+  }
+}
+
+template <typename flt_t>
+inline flt_t Sp2(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  flt_t t = (r - lo) / (hi - lo);
+  if (t <= 0) {
+    if (del) *del = 0;
+    return 1;
+  } else if (t >= 1) {
+    if (del) *del = 0;
+    return 0;
+  } else {
+    if (del) *del = 6 * (t * t - t) / (hi - lo);
+    return 1 - t * t * (3 - 2 * t);
+  }
+}
+
+template <typename flt_t>
+inline flt_t eval_poly_lin(int n, flt_t * coeffs, flt_t x, flt_t * deriv) {
+  flt_t result = coeffs[n - 1];
+  *deriv = coeffs[n - 1] * (n - 1);
+  for (int i = n - 2; i > 0; i--) {
+    result = coeffs[i] + x * result;
+    *deriv = coeffs[i] * i + x * (*deriv);
+  }
+  result = coeffs[0] + x * result;
+  return result;
+}
+
+template <typename flt_t, typename acc_t>
+inline flt_t gSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    flt_t cos, flt_t N, flt_t * dgdc, flt_t * dgdN) {
+  flt_t NCmin = ka->params.NCmin;
+  flt_t NCmax = ka->params.NCmax;
+  int index = 0;
+  flt_t * gDom = NULL;
+  int nDom = 0;
+  int offs = 0;
+  if (itype == 0) {
+    nDom = 4;
+    gDom = &ka->params.gCdom[0];
+    if (N > NCmin) offs = 4 * 6;
+  } else {
+    nDom = 3;
+    gDom = &ka->params.gHdom[0];
+    offs = 8 * 6;
+  }
+  cos = fmax_nonan(gDom[0], fmin_nonan(gDom[nDom], cos));
+  int i;
+  for (i = 0; i < nDom; i++) {
+    if (cos >= gDom[i] && cos <= gDom[i + 1]) {
+      index = i;
+    }
+  }
+  flt_t g = eval_poly_lin(6, &ka->params.gVal[offs+index*6], cos, dgdc);
+  *dgdN = 0;
+  if (itype == 0 && N > NCmin && N < NCmax) {
+    flt_t dg1;
+    flt_t g1 = eval_poly_lin(6, &ka->params.gVal[index*6], cos, &dg1);
+    flt_t dS;
+    flt_t cut = Sp(N, NCmin, NCmax, &dS);
+    *dgdN = dS * (g1 - g);
+    g = g + cut * (g1 - g);
+    *dgdc = *dgdc + cut * (dg1 - *dgdc);
+  }
+  return g;
+}
+
+template <typename flt_t>
+inline flt_t eval_poly_bi(int n, flt_t * coeffs, flt_t x, flt_t y,
+                          flt_t * deriv) {
+  flt_t dy;
+  flt_t vy = eval_poly_lin(n, &coeffs[n * (n - 1)], y, &dy);
+  flt_t result = vy;
+  deriv[0] = vy * (n - 1);
+  deriv[1] = dy;
+  for (int i = n - 2; i > 0; i--) {
+    vy = eval_poly_lin(n, &coeffs[n * i], y, &dy);
+    result = vy + x * result;
+    deriv[0] = vy * i + x * deriv[0];
+    deriv[1] = dy + x * deriv[1];
+  }
+  result = eval_poly_lin(n, &coeffs[0], y, &dy) + x * result;
+  deriv[1] = dy + x * deriv[1];
+  return result;
+}
+
+template <typename flt_t>
+inline flt_t eval_poly_tri(int n, flt_t * coeffs, flt_t x, flt_t y, flt_t z,
+                           flt_t * deriv) {
+  flt_t dyz[2];
+  flt_t vyz = eval_poly_bi(n, &coeffs[n * n * (n - 1)], y, z, &dyz[0]);
+  flt_t result = vyz;
+  deriv[0] = vyz * (n - 1);
+  deriv[1] = dyz[0];
+  deriv[2] = dyz[1];
+  for (int i = n - 2; i > 0; i--) {
+    vyz = eval_poly_bi(n, &coeffs[n * n * i], y, z, &dyz[0]);
+    result = vyz + x * result;
+    deriv[0] = vyz * i + x * deriv[0];
+    deriv[1] = dyz[0] + x * deriv[1];
+    deriv[2] = dyz[1] + x * deriv[2];
+  }
+  result = eval_poly_bi(n, &coeffs[0], y, z, &dyz[0]) + x * result;
+  deriv[1] = dyz[0] + x * deriv[1];
+  deriv[2] = dyz[1] + x * deriv[2];
+  return result;
+}
+
+template <typename flt_t, typename acc_t>
+inline flt_t PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    int jtype, flt_t NC, flt_t NH, flt_t * dN) {
+  dN[0] = 0.0;
+  dN[1] = 0.0;
+  if (itype == HYDROGEN) return 0;
+  flt_t *pCJdom = jtype == CARBON ?
&ka->params.pCCdom[0][0] : + &ka->params.pCHdom[0][0]; + NC = fmax_nonan(pCJdom[0], fmin_nonan(pCJdom[1], NC)); + NH = fmax_nonan(pCJdom[2], fmin_nonan(pCJdom[3], NH)); + int nC = floor(NC); + int nH = floor(NH); + #define PijSelect(a, b) (jtype == CARBON ? ka->params.a : ka->params.b) + if (fabs(NC - nC) < TOL && fabs(NH - nH) < TOL) { + dN[0] = PijSelect(PCCdfdx, PCHdfdx)[nC][nH]; + dN[1] = PijSelect(PCCdfdy, PCHdfdy)[nC][nH]; + return PijSelect(PCCf, PCHf)[nC][nH]; + } + if (NC == pCJdom[1]) nC -= 1; + if (NH == pCJdom[3]) nH -= 1; + return eval_poly_bi(4, &PijSelect(pCC, pCH)[nC][nH][0], NC, NH, dN); + #undef PijSelect +} + +template +inline flt_t TijSpline(KernelArgsAIREBOT * ka, flt_t Nij, + flt_t Nji, flt_t Nijconj, acc_t * dN3) { + flt_t * Tijdom = &ka->params.Tijdom[0][0]; + Nij = fmax_nonan(Tijdom[0], fmin_nonan(Tijdom[1], Nij)); + Nji = fmax_nonan(Tijdom[2], fmin_nonan(Tijdom[3], Nji)); + Nijconj = fmax_nonan(Tijdom[4], fmin_nonan(Tijdom[5], Nijconj)); + int nij = floor(Nij); + int nji = floor(Nji); + int nijconj = floor(Nijconj); + if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < + TOL && fabs(Nijconj - nijconj) < TOL) { + dN3[0] = ka->params.Tdfdx[nij][nji][nijconj]; + dN3[1] = ka->params.Tdfdy[nij][nji][nijconj]; + dN3[2] = ka->params.Tdfdz[nij][nji][nijconj]; + return ka->params.Tf[nij][nji][nijconj]; + } + if (Nij == Tijdom[1]) nij -= 1; + if (Nji == Tijdom[3]) nji -= 1; + if (Nijconj == Tijdom[5]) nijconj -= 1; + return eval_poly_tri(4, &ka->params.Tijc[nij][nji][nijconj][0], Nij, + Nji, Nijconj, dN3); +} + +template +inline flt_t piRCSpline(KernelArgsAIREBOT * ka, int itype, + int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, acc_t * dN3) { + const int HH = 2; + const int CH = 1; + /* const int CC = 0; */ + int select = itype + jtype; + #define piRCSelect(a, b, c) (select == HH ? ka->params.a : select == CH ? \ + ka->params.b : ka->params.c) + flt_t * piIJdom = &piRCSelect(piHHdom, piCHdom, piCCdom)[0][0]; + if (select == HH) { + if (Nij < piIJdom[0] || Nij > piIJdom[1] || Nji < piIJdom[2] || + Nji > piIJdom[3] || Nijconj < piIJdom[4] || Nijconj > piIJdom[5]) { + Nij = 0; + Nji = 0; + Nijconj = 0; + } + } + Nij = fmax_nonan(piIJdom[0], fmin_nonan(piIJdom[1], Nij)); + Nji = fmax_nonan(piIJdom[2], fmin_nonan(piIJdom[3], Nji)); + Nijconj = fmax_nonan(piIJdom[4], fmin_nonan(piIJdom[5], Nijconj)); + int nij = floor(Nij); + int nji = floor(Nji); + int nijconj = floor(Nijconj); + if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < + TOL && fabs(Nijconj - nijconj) < TOL) { + dN3[0] = piRCSelect(piHHdfdx, piCHdfdx, piCCdfdx)[nij][nji][nijconj]; + dN3[1] = piRCSelect(piHHdfdy, piCHdfdy, piCCdfdy)[nij][nji][nijconj]; + dN3[2] = piRCSelect(piHHdfdz, piCHdfdz, piCCdfdz)[nij][nji][nijconj]; + return piRCSelect(piHHf, piCHf, piCCf)[nij][nji][nijconj]; + } + if (Nij == piIJdom[1]) nij -= 1; + if (Nji == piIJdom[3]) nji -= 1; + if (Nijconj == piIJdom[5]) nijconj -= 1; + return eval_poly_tri(4, + &piRCSelect(piHH, piCH, piCC)[nij][nji][nijconj][0], Nij, Nji, Nijconj, + dN3); + #undef piRCSelect +} + +/* + * Implements the p_ij term in airebo, which occurs on 4 different occasions + * in the original lammps code. 
+ */ +template +inline flt_t frebo_pij(KernelArgsAIREBOT * ka, int i, int j, + flt_t rijx, flt_t rijy, flt_t rijz, flt_t rijmag, flt_t wij, flt_t VA, + flt_t * sum_N, acc_t fij[3]) { + ResultForceT * result_f = ka->result_f; + AtomAIREBOT * x = ka->x; + int * map = ka->map; + flt_t * nC = ka->nC; + flt_t * nH = ka->nH; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int itype = map[x[i].w]; + int jtype = map[x[j].w]; + flt_t invrijm = 1 / rijmag; + flt_t invrijm2 = invrijm * invrijm; + flt_t rcminij = ka->params.rcmin[itype][jtype]; + flt_t rcmaxij = ka->params.rcmax[itype][jtype]; + flt_t Nmin = ka->params.Nmin; + flt_t Nmax = ka->params.Nmax; + flt_t Nij = nC[i] + nH[i] - wij; + flt_t NijC = nC[i] - wij * (1 - jtype); + flt_t NijH = nH[i] - wij * jtype; + flt_t sum_pij = 0; + flt_t sum_dpij_dN = 0; + flt_t dN2[2] = {0}; + flt_t pij = 0; + *sum_N = 0; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int pass; + for (pass = 0; pass < 2; pass++) { + int kk; + int knum = ka->neigh_rebo.num[i]; + for (kk = 0; kk < knum; kk++) { + int k = neighs[kk]; + if (k == j) continue; + flt_t rikx = x_i - x[k].x; + flt_t riky = y_i - x[k].y; + flt_t rikz = z_i - x[k].z; + int ktype = map[x[k].w]; + flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz); + flt_t rho_k = ka->params.rho[ktype][1]; + flt_t rho_j = ka->params.rho[jtype][1]; + flt_t lamdajik = 4 * itype * ((rho_k - rikmag) - (rho_j - rijmag)); + flt_t ex_lam = exp(lamdajik); + flt_t rcminik = ka->params.rcmin[itype][ktype]; + flt_t rcmaxik = ka->params.rcmax[itype][ktype]; + flt_t dwik; + flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik); + flt_t Nki = nC[k] + nH[k] - wik; + flt_t cosjik = (rijx * rikx + rijy * riky + rijz * rikz) / + (rijmag * rikmag); + cosjik = fmin_nonan(1, fmax_nonan(-1, cosjik)); + flt_t dgdc, dgdN; + flt_t g = gSpline(ka, itype, cosjik, Nij, &dgdc, &dgdN); + if (pass == 0) { + sum_pij += wik * g * ex_lam; + sum_dpij_dN += wik * dgdN * ex_lam; + flt_t cutN = Sp(Nki, Nmin, Nmax, NULL); + *sum_N += (1 - ktype) * wik * cutN; + } else { + flt_t tmp = -0.5 * pij * pij * pij; + flt_t invrikm = 1 / rikmag; + flt_t rjkx = rikx - rijx; + flt_t rjky = riky - rijy; + flt_t rjkz = rikz - rijz; + flt_t rjkmag = sqrt(rjkx * rjkx + rjky * rjky + rjkz * rjkz); + flt_t rijrik = 2 * rijmag * rikmag; + flt_t rr = rijmag * rijmag - rikmag * rikmag; + flt_t dctdjk = -2 / rijrik; + flt_t dctdik = (-rr + rjkmag * rjkmag) / (rijrik * rikmag * rikmag); + flt_t dctdij = (rr + rjkmag * rjkmag) / (rijrik * rijmag * rijmag); + + acc_t fi[3], fj[3], fk[3]; + flt_t pref = 0.5 * VA * tmp; + flt_t tmp20 = pref * wik * dgdc * ex_lam; + fj[0] = fj[1] = fj[2] = 0; + fi[0] = -tmp20 * dctdik * rikx; + fi[1] = -tmp20 * dctdik * riky; + fi[2] = -tmp20 * dctdik * rikz; + fk[0] = tmp20 * dctdik * rikx; + fk[1] = tmp20 * dctdik * riky; + fk[2] = tmp20 * dctdik * rikz; + + fij[0] += -tmp20 * dctdij * rijx; + fij[1] += -tmp20 * dctdij * rijy; + fij[2] += -tmp20 * dctdij * rijz; + + fi[0] += -tmp20 * dctdjk * rjkx; + fi[1] += -tmp20 * dctdjk * rjky; + fi[2] += -tmp20 * dctdjk * rjkz; + fk[0] += tmp20 * dctdjk * rjkx; + fk[1] += tmp20 * dctdjk * rjky; + fk[2] += tmp20 * dctdjk * rjkz; + fij[0] -= -tmp20 * dctdjk * rjkx; + fij[1] -= -tmp20 * dctdjk * rjky; + fij[2] -= -tmp20 * dctdjk * rjkz; + + flt_t tmp21 = pref * (wik * g * ex_lam * 4 * itype); + fij[0] -= 1 * tmp21 * rijx * invrijm; + fij[1] -= 1 * tmp21 * rijy * invrijm; + fij[2] -= 1 * tmp21 * rijz * invrijm; + fi[0] -= tmp21 * (-rikx * invrikm); + fi[1] 
-= tmp21 * (-riky * invrikm); + fi[2] -= tmp21 * (-rikz * invrikm); + fk[0] -= tmp21 * (rikx * invrikm); + fk[1] -= tmp21 * (riky * invrikm); + fk[2] -= tmp21 * (rikz * invrikm); + + // coordination forces + + // dwik forces + flt_t tmp22 = pref * dwik * g * ex_lam * invrikm; + fi[0] -= tmp22 * rikx; + fi[1] -= tmp22 * riky; + fi[2] -= tmp22 * rikz; + fk[0] += tmp22 * rikx; + fk[1] += tmp22 * riky; + fk[2] += tmp22 * rikz; + + // PIJ forces + flt_t tmp23 = pref * dN2[ktype] * dwik * invrikm; + fi[0] -= tmp23 * rikx; + fi[1] -= tmp23 * riky; + fi[2] -= tmp23 * rikz; + fk[0] += tmp23 * rikx; + fk[1] += tmp23 * riky; + fk[2] += tmp23 * rikz; + + // dgdN forces + flt_t tmp24 = pref * sum_dpij_dN * dwik * invrikm; + fi[0] -= tmp24 * rikx; + fi[1] -= tmp24 * riky; + fi[2] -= tmp24 * rikz; + fk[0] += tmp24 * rikx; + fk[1] += tmp24 * riky; + fk[2] += tmp24 * rikz; + + result_f[i].x += fi[0]; + result_f[i].y += fi[1]; + result_f[i].z += fi[2]; + result_f[j].x += fj[0]; + result_f[j].y += fj[1]; + result_f[j].z += fj[2]; + result_f[k].x += fk[0]; + result_f[k].y += fk[1]; + result_f[k].z += fk[2]; + } + } + if (pass == 0) { + flt_t PijS = PijSpline(ka, itype, jtype, NijC, NijH, dN2); + pij = 1 / overloaded::sqrt(1 + sum_pij + PijS); + } + } + return pij; +} + +template +inline flt_t frebo_pi_rc(KernelArgsAIREBOT * ka, int itype, + int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) { + acc_t dN3tmp[3] = {0}; + flt_t ret = piRCSpline(ka, itype, jtype, Nij, Nji, Nijconj, dN3tmp); + dN3[0] = dN3tmp[0]; + dN3[1] = dN3tmp[1]; + dN3[2] = dN3tmp[2]; + return ret; +} + +template +inline flt_t frebo_Tij(KernelArgsAIREBOT * ka, int itype, + int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) { + dN3[0] = 0; + dN3[1] = 0; + dN3[2] = 0; + if (itype == HYDROGEN || jtype == HYDROGEN) return 0; + acc_t dN3tmp[3] = {0}; + flt_t ret = TijSpline(ka, Nij, Nji, Nijconj, dN3tmp); + dN3[0] = dN3tmp[0]; + dN3[1] = dN3tmp[1]; + dN3[2] = dN3tmp[2]; + return ret; +} + +/* + * Implements a scalar version of the sum cos^1(omega) term used in pi^dh_ij. + * Occurs in both bondorder and bondorderLJ. 
+ */
+template <typename flt_t, typename acc_t>
+inline flt_t frebo_sum_omega(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+    int j, flt_t r23x, flt_t r23y, flt_t r23z, flt_t r23mag, flt_t VA,
+    acc_t fij[3]) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  acc_t sum_omega = 0;
+  int a2 = i;
+  int a3 = j;
+  flt_t r32x = - r23x;
+  flt_t r32y = - r23y;
+  flt_t r32z = - r23z;
+  int * map = ka->map;
+  AtomAIREBOT<flt_t> * x = ka->x;
+  flt_t thmin = ka->params.thmin;
+  flt_t thmax = ka->params.thmax;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  int * neighs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int * neighs_j = ka->neigh_rebo.entries + ka->neigh_rebo.offset[j];
+  int num_i = ka->neigh_rebo.num[i];
+  int num_j = ka->neigh_rebo.num[j];
+  int kk;
+  for (kk = 0; kk < num_i; kk++) {
+    int k = neighs_i[kk];
+    if (k == j) continue;
+    int a1 = k;
+    int ktype = map[x[k].w];
+    flt_t r21x = x[a2].x - x[a1].x;
+    flt_t r21y = x[a2].y - x[a1].y;
+    flt_t r21z = x[a2].z - x[a1].z;
+    flt_t r21mag = overloaded::sqrt(r21x * r21x + r21y * r21y + r21z * r21z);
+    flt_t cos321 = (r23x * r21x + r23y * r21y + r23z * r21z) /
+        (r23mag * r21mag);
+    cos321 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos321));
+    flt_t sin321 = overloaded::sqrt(1 - cos321 * cos321);
+    if (sin321 == 0) continue;
+    flt_t sink2i = 1 / (sin321 * sin321);
+    flt_t rik2i = 1 / (r21mag * r21mag);
+    flt_t rr = r23mag * r23mag - r21mag * r21mag;
+    flt_t r31x = r21x - r23x;
+    flt_t r31y = r21y - r23y;
+    flt_t r31z = r21z - r23z;
+    flt_t r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z;
+    flt_t rijrik = 2 * r23mag * r21mag;
+    flt_t r21mag2 = r21mag * r21mag;
+    flt_t dctik = (-rr + r31mag2) / (rijrik * r21mag2);
+    flt_t dctij = (rr + r31mag2) / (rijrik * r23mag * r23mag);
+    flt_t dctjk = -2 / rijrik;
+    flt_t rcmin21 = ka->params.rcmin[itype][ktype];
+    flt_t rcmaxp21 = ka->params.rcmaxp[itype][ktype];
+    flt_t dw21;
+    flt_t w21 = Sp(r21mag, rcmin21, rcmaxp21, &dw21);
+    // Why does this additional cutoff on the cosine exist?  The original
+    // code by Stuart answers this: it avoids issues when bonds in the
+    // dihedral are linear by switching the dihedral off beforehand.
+    // This is the reason for both the sin == 0 checks and the
+    // tspjik = Sp2(..) calls.  Unfortunately, this is not stated
+    // explicitly in the original paper.  It might be similar in purpose
+    // to the H(sin - s^min) term that appears in that paper, but it
+    // cannot be found in the original REBO papers.
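+    // Added commentary (editor's sketch, not in the original patch):
+    // the quantity accumulated into sum_omega in the loop below is
+    //   (1 - cos^2(w_kijl)) * w21 * w34 * (1 - tspjik) * (1 - tspijl)
+    // with cos(w) = (r32 x r21).(r23 x r34) / (|r32 x r21| |r23 x r34|);
+    // the two Sp2() factors switch the dihedral contribution off smoothly
+    // as either bond angle approaches 180 degrees, where w is undefined.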
+    flt_t dtsjik;
+    flt_t tspjik = Sp2(cos321, thmin, thmax, &dtsjik);
+    dtsjik = -dtsjik;
+    int ll;
+    for (ll = 0; ll < num_j; ll++) {
+      int l = neighs_j[ll];
+      if (l == i || l == k) continue;
+      int ltype = map[x[l].w];
+      int a4 = l;
+      flt_t r34x = x[a3].x - x[a4].x;
+      flt_t r34y = x[a3].y - x[a4].y;
+      flt_t r34z = x[a3].z - x[a4].z;
+      flt_t r34mag = overloaded::sqrt(r34x * r34x + r34y * r34y +
+          r34z * r34z);
+      flt_t cos234 = (r32x * r34x + r32y * r34y + r32z * r34z) /
+          (r23mag * r34mag);
+      cos234 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos234));
+      flt_t sin234 = overloaded::sqrt(1 - cos234 * cos234);
+      if (sin234 == 0) continue;
+      flt_t sinl2i = 1 / (sin234 * sin234);
+      flt_t rjl2i = 1 / (r34mag * r34mag);
+
+      flt_t rcminjl = ka->params.rcmin[jtype][ltype];
+      flt_t rcmaxpjl = ka->params.rcmaxp[jtype][ltype];
+      flt_t dw34;
+      flt_t w34 = Sp(r34mag, rcminjl, rcmaxpjl, &dw34);
+      flt_t rr = (r23mag * r23mag) - (r34mag * r34mag);
+      flt_t r24x = r23x + r34x;
+      flt_t r24y = r23y + r34y;
+      flt_t r24z = r23z + r34z;
+      flt_t r242 = (r24x * r24x) + (r24y * r24y) + (r24z * r24z);
+      flt_t rijrjl = 2 * r23mag * r34mag;
+      flt_t rjl2 = r34mag * r34mag;
+      flt_t dctjl = (-rr + r242) / (rijrjl * rjl2);
+      flt_t dctji = (rr + r242) / (rijrjl * r23mag * r23mag);
+      flt_t dctil = -2 / rijrjl;
+      flt_t dtsijl;
+      flt_t tspijl = Sp2(cos234, thmin, thmax, &dtsijl);
+      dtsijl = -dtsijl; // need minus sign
+      flt_t prefactor = VA;
+
+      flt_t cross321x = (r32y * r21z) - (r32z * r21y);
+      flt_t cross321y = (r32z * r21x) - (r32x * r21z);
+      flt_t cross321z = (r32x * r21y) - (r32y * r21x);
+      flt_t cross234x = (r23y * r34z) - (r23z * r34y);
+      flt_t cross234y = (r23z * r34x) - (r23x * r34z);
+      flt_t cross234z = (r23x * r34y) - (r23y * r34x);
+
+      flt_t cwnum = (cross321x * cross234x) + (cross321y * cross234y) +
+          (cross321z * cross234z);
+      flt_t cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
+      flt_t om1234 = cwnum / cwnom;
+      flt_t cw = om1234;
+      sum_omega += ((1 - (om1234 * om1234)) * w21 * w34) *
+          (1 - tspjik) * (1 - tspijl);
+      if (VA == static_cast<flt_t>(0.0)) continue;
+
+      flt_t dt1dik = (rik2i) - (dctik * sink2i * cos321);
+      flt_t dt1djk = (-dctjk * sink2i * cos321);
+      flt_t dt1djl = (rjl2i) - (dctjl * sinl2i * cos234);
+      flt_t dt1dil = (-dctil * sinl2i * cos234);
+      flt_t dt1dij = (2 / (r23mag * r23mag)) -
+          (dctij * sink2i * cos321) - (dctji * sinl2i * cos234);
+
+      flt_t dt2dikx = (-r23z * cross234y) + (r23y * cross234z);
+      flt_t dt2diky = (-r23x * cross234z) + (r23z * cross234x);
+      flt_t dt2dikz = (-r23y * cross234x) + (r23x * cross234y);
+
+      flt_t dt2djlx = (-r23y * cross321z) + (r23z * cross321y);
+      flt_t dt2djly = (-r23z * cross321x) + (r23x * cross321z);
+      flt_t dt2djlz = (-r23x * cross321y) + (r23y * cross321x);
+
+      flt_t dt2dijx = (r21z * cross234y) - (r34z * cross321y) -
+          (r21y * cross234z) + (r34y * cross321z);
+      flt_t dt2dijy = (r21x * cross234z) - (r34x * cross321z) -
+          (r21z * cross234x) + (r34z * cross321x);
+      flt_t dt2dijz = (r21y * cross234x) - (r34y * cross321x) -
+          (r21x * cross234y) + (r34x * cross321y);
+
+      flt_t aa = (prefactor * 2 * cw / cwnom) * w21 * w34 *
+          (1 - tspjik) * (1 - tspijl);
+      flt_t aaa1 = -prefactor * (1 - (om1234 * om1234)) *
+          (1 - tspjik) * (1 - tspijl);
+      flt_t aaa2 = -prefactor * (1 - (om1234 * om1234)) * w21 * w34;
+      flt_t at2 = aa * cwnum;
+
+      flt_t fcijpc = (-dt1dij * at2) +
+          (aaa2 * dtsjik * dctij * (1 - tspijl)) +
+          (aaa2 * dtsijl * dctji * (1 - tspjik));
+      flt_t fcikpc = (-dt1dik * at2) +
+          (aaa2 * dtsjik * dctik * (1 - tspijl));
+      flt_t fcjlpc = (-dt1djl
* at2) + + (aaa2 * dtsijl * dctjl * (1 - tspjik)); + flt_t fcjkpc = (-dt1djk * at2) + + (aaa2 * dtsjik * dctjk * (1 - tspijl)); + flt_t fcilpc = (-dt1dil * at2) + + (aaa2 * dtsijl * dctil * (1 - tspjik)); + + flt_t F23x = (fcijpc * r23x) + (aa * dt2dijx); + flt_t F23y = (fcijpc * r23y) + (aa * dt2dijy); + flt_t F23z = (fcijpc * r23z) + (aa * dt2dijz); + + flt_t F12x = (fcikpc * r21x) + (aa * dt2dikx); + flt_t F12y = (fcikpc * r21y) + (aa * dt2diky); + flt_t F12z = (fcikpc * r21z) + (aa * dt2dikz); + + flt_t F34x = (fcjlpc * r34x) + (aa * dt2djlx); + flt_t F34y = (fcjlpc * r34y) + (aa * dt2djly); + flt_t F34z = (fcjlpc * r34z) + (aa * dt2djlz); + + flt_t F31x = (fcjkpc * r31x); + flt_t F31y = (fcjkpc * r31y); + flt_t F31z = (fcjkpc * r31z); + + flt_t F24x = (fcilpc * r24x); + flt_t F24y = (fcilpc * r24y); + flt_t F24z = (fcilpc * r24z); + + flt_t f1x = -F12x - F31x; + flt_t f1y = -F12y - F31y; + flt_t f1z = -F12z - F31z; + flt_t f2x = F12x + F31x; + flt_t f2y = F12y + F31y; + flt_t f2z = F12z + F31z; + flt_t f3x = F34x + F24x; + flt_t f3y = F34y + F24y; + flt_t f3z = F34z + F24z; + flt_t f4x = -F34x - F24x; + flt_t f4y = -F34y - F24y; + flt_t f4z = -F34z - F24z; + + fij[0] += F23x + F24x - F31x; + fij[1] += F23y + F24y - F31y; + fij[2] += F23z + F24z - F31z; + + // coordination forces + + flt_t tmp20 = VA * ((1 - (om1234 * om1234))) * + (1 - tspjik) * (1 - tspijl) * dw21 * w34 / r21mag; + f2x -= tmp20 * r21x; + f2y -= tmp20 * r21y; + f2z -= tmp20 * r21z; + f1x += tmp20 * r21x; + f1y += tmp20 * r21y; + f1z += tmp20 * r21z; + + flt_t tmp21 = VA * ((1 - (om1234 * om1234))) * + (1 - tspjik) * (1 - tspijl) * w21 * dw34 / r34mag; + f3x -= tmp21 * r34x; + f3y -= tmp21 * r34y; + f3z -= tmp21 * r34z; + f4x += tmp21 * r34x; + f4y += tmp21 * r34y; + f4z += tmp21 * r34z; + + result_f[a1].x += f1x; + result_f[a1].y += f1y; + result_f[a1].z += f1z; + result_f[a2].x += f2x; + result_f[a2].y += f2y; + result_f[a2].z += f2z; + result_f[a3].x += f3x; + result_f[a3].y += f3y; + result_f[a3].z += f3z; + result_f[a4].x += f4x; + result_f[a4].y += f4y; + result_f[a4].z += f4z; + } + } + return sum_omega; +} + +/* + * Implements a scalar implementation the force update due to splines. + * It is used for both pi^rc_ij and T_ij. + * Occurs four times in each bondorder and bondorderLJ. 
+ */ +template +inline void frebo_N_spline_force(KernelArgsAIREBOT * ka, int i, + int j, flt_t VA, flt_t dN, flt_t dNconj, flt_t Nconj) { + int * map = ka->map; + AtomAIREBOT * x = ka->x; + ResultForceT * result_f = ka->result_f; + flt_t * nC = ka->nC; + flt_t * nH = ka->nH; + flt_t Nmin = ka->params.Nmin; + flt_t Nmax = ka->params.Nmax; + int itype = map[x[i].w]; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int knum = ka->neigh_rebo.num[i]; + int kk; + for (kk = 0; kk < knum; kk++) { + int k = neighs[kk]; + if (k == j) continue; + flt_t rikx = x[i].x - x[k].x; + flt_t riky = x[i].y - x[k].y; + flt_t rikz = x[i].z - x[k].z; + flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz); + int ktype = map[x[k].w]; + flt_t rcminik = ka->params.rcmin[itype][ktype]; + flt_t rcmaxik = ka->params.rcmax[itype][ktype]; + flt_t dwik; + flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik); + flt_t Nki = nC[k] + nH[k] - wik; + flt_t dNki; + flt_t SpN = Sp(Nki, Nmin, Nmax, &dNki); + flt_t fdN = VA * dN * dwik / rikmag; + flt_t fdNconj = VA * dNconj * 2 * Nconj * dwik * SpN / rikmag; + flt_t ffactor = fdN; + if (ktype == 0) ffactor += fdNconj; + flt_t fkx = ffactor * rikx; + flt_t fky = ffactor * riky; + flt_t fkz = ffactor * rikz; + result_f[i].x -= fkx; + result_f[i].y -= fky; + result_f[i].z -= fkz; + result_f[k].x += fkx; + result_f[k].y += fky; + result_f[k].z += fkz; + if (ktype != 0 || fabs(dNki) <= TOL) continue; + int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k]; + int nnum = ka->neigh_rebo.num[k]; + int nn; + for (nn = 0; nn < nnum; nn++) { + int n = neighs_k[nn]; + if (n == i) continue; + flt_t rknx = x[k].x - x[n].x; + flt_t rkny = x[k].y - x[n].y; + flt_t rknz = x[k].z - x[n].z; + flt_t rknmag = overloaded::sqrt(rknx * rknx + rkny * rkny + rknz * rknz); + int ntype = map[x[n].w]; + flt_t rcminkn = ka->params.rcmin[ktype][ntype]; + flt_t rcmaxkn = ka->params.rcmax[ktype][ntype]; + flt_t dwkn; + Sp(rknmag, rcminkn, rcmaxkn, &dwkn); + flt_t ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag; + result_f[k].x -= ffactor * rknx; + result_f[k].y -= ffactor * rkny; + result_f[k].z -= ffactor * rknz; + result_f[n].x += ffactor * rknx; + result_f[n].y += ffactor * rkny; + result_f[n].z += ffactor * rknz; + } + } +} + +/* + * This data-structure contains the result of a search through neighbor-lists. + * It is used to calculate C_ij and the corresponding force updates. + */ +template +struct LennardJonesPathAIREBOT { + AtomAIREBOT del[3]; + int num; + flt_t w[3]; + flt_t dw[3]; + flt_t r[3]; + int idx[4]; +}; + +/* + * Checks a candidate path stored in idxs whether it is better than *path + * and updates *path accordingly. 
+ */ +template +inline flt_t ref_lennard_jones_test_path_single( + KernelArgsAIREBOT * ka, flt_t best, int num, int * idxs, + LennardJonesPathAIREBOT * path) { + LennardJonesPathAIREBOT result; + AtomAIREBOT * x = ka->x; + int * map = ka->map; + result.num = num; + flt_t combined = 1; + for (int i = num - 2; i >= 0; i--) { + int a0 = idxs[i+0]; + int a1 = idxs[i+1]; + flt_t delx = x[a1].x - x[a0].x; + flt_t dely = x[a1].y - x[a0].y; + flt_t delz = x[a1].z - x[a0].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + int type0 = map[x[a0].w]; + int type1 = map[x[a1].w]; + if (rsq >= ka->params.rcmaxsq[type0][type1]) return best; + flt_t r = overloaded::sqrt(rsq); + flt_t dw, w = Sp(r, ka->params.rcmin[type0][type1], + ka->params.rcmax[type0][type1], &dw); + if (w == 0) return best; + combined *= w; + if (combined <= best) return best; + result.idx[i] = a0; + result.del[i].x = delx; + result.del[i].y = dely; + result.del[i].z = delz; + result.r[i] = r; + result.w[i] = w; + result.dw[i] = dw; + } + result.idx[num - 1] = idxs[num - 1]; + *path = result; + return combined; +} + +/* + * Test through all paths surrounding i and j to find the corresponding + * best path. Uses the same iteration ordering as FLJ() does. + * Note that an optimization would use the j neighlist instead in the inner + * loop. + */ +template +inline flt_t ref_lennard_jones_test_path(KernelArgsAIREBOT * ka, + int i, int j, flt_t rij, flt_t rcmax, + LennardJonesPathAIREBOT * path) { + int idxs[4]; + idxs[0] = i; + idxs[1] = j; + flt_t best = 0; + if (rij <= rcmax) { + best = ref_lennard_jones_test_path_single(ka, best, 2, idxs, path); + if (best == static_cast(1.0)) return 0; + } + for (int kk = 0; kk < ka->neigh_rebo.num[i]; kk++) { + int k = ka->neigh_rebo.entries[ka->neigh_rebo.offset[i] + kk]; + if (k == j) continue; + idxs[1] = k; + idxs[2] = j; + best = ref_lennard_jones_test_path_single(ka, best, 3, idxs, path); + if (best == static_cast(1.0)) return 0; + for (int mm = 0; mm < ka->neigh_rebo.num[k]; mm++) { + int m = ka->neigh_rebo.entries[ka->neigh_rebo.offset[k] + mm]; + if (m == i || m == j) continue; + idxs[2] = m; + idxs[3] = j; + best = ref_lennard_jones_test_path_single(ka, best, 4, idxs, path); + if (best == static_cast(1.0)) return 0; + } + } + return 1 - best; +} + +/* + * Conducts the force update due to C_ij, given the active path. + */ +template +inline void ref_lennard_jones_force_path(KernelArgsAIREBOT * ka, + flt_t dC, LennardJonesPathAIREBOT * path) { + AtomAIREBOT * x = ka->x; + ResultForceT * result_f = ka->result_f; + for (int i = 0; i < path->num - 1; i++) { + flt_t fpair = dC * path->dw[i] / path->r[i]; + for (int j = 0; j < path->num - 1; j++) { + if (i != j) fpair *= path->w[j]; + } + result_f[path->idx[i+0]].x -= fpair * path->del[i].x; + result_f[path->idx[i+0]].y -= fpair * path->del[i].y; + result_f[path->idx[i+0]].z -= fpair * path->del[i].z; + result_f[path->idx[i+1]].x += fpair * path->del[i].x; + result_f[path->idx[i+1]].y += fpair * path->del[i].y; + result_f[path->idx[i+1]].z += fpair * path->del[i].z; + } +} + +/* + * Calculate the bondorderLJ term. 
+ */ +template +inline flt_t ref_lennard_jones_bondorder(KernelArgsAIREBOT * ka, + int i, int j, flt_t VA, acc_t fij[3]) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + + int itype = map[x[i].w]; + int jtype = map[x[j].w]; + + flt_t delx = x[i].x - x[j].x; + flt_t dely = x[i].y - x[j].y; + flt_t delz = x[i].z - x[j].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + flt_t rij = overloaded::sqrt(rsq); + + flt_t rcminij = ka->params.rcmin[itype][jtype]; + flt_t rcmaxij = ka->params.rcmax[itype][jtype]; + flt_t dwij; + flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij); + + flt_t the_r = ka->params.rcmin[itype][jtype]; + flt_t scale = the_r / rij; + flt_t Nij = ka->nH[i] + ka->nC[i] - wij; + flt_t Nji = ka->nH[j] + ka->nC[j] - wij; + flt_t NconjtmpI; + acc_t fijc[3] = {0}, fjic[3] = {0}; + flt_t pij = frebo_pij(ka, i, j, delx * scale, dely * scale, + delz * scale, the_r, wij, 0.0, &NconjtmpI, fijc); + flt_t NconjtmpJ; + flt_t pji = frebo_pij(ka, j, i, -delx * scale, -dely * scale, + -delz * scale, the_r, wij, 0.0, &NconjtmpJ, fjic); + flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ); + flt_t dN3_pi_rc[3]; + flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, + dN3_pi_rc); + flt_t dN3_Tij[3]; + flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, + dN3_Tij); + flt_t sum_omega = 0; + if (fabs(Tij) > TOL) { + sum_omega = frebo_sum_omega(ka, i, j, delx * scale, dely * + scale, delz * scale, the_r, 0.0, + fijc); + } + flt_t pi_dh = Tij * sum_omega; + flt_t bij = 0.5 * (pij + pji) + pi_rc + pi_dh; + flt_t dStb; + flt_t Stb = Sp2(bij, ka->params.bLJmin[itype][jtype], + ka->params.bLJmax[itype][jtype], &dStb); + if (dStb != 0) { + flt_t pij_reverse = frebo_pij(ka, i, j, delx * scale, + dely * scale, delz * scale, the_r, wij, VA * dStb, &NconjtmpI, fijc); + flt_t pji_reverse = frebo_pij(ka, j, i, -delx * scale, + -dely * scale, -delz * scale, the_r, wij, VA * dStb, &NconjtmpJ, fjic); + fijc[0] -= fjic[0]; + fijc[1] -= fjic[1]; + fijc[2] -= fjic[2]; + frebo_N_spline_force(ka, i, j, VA * dStb, dN3_pi_rc[0], + dN3_pi_rc[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA * dStb, dN3_pi_rc[1], + dN3_pi_rc[2], NconjtmpJ); + if (fabs(Tij) > TOL) { + flt_t sum_omega_reverse = frebo_sum_omega(ka, i, j, + delx * scale, dely * scale, delz * scale, the_r, VA * dStb * Tij, fijc); + frebo_N_spline_force(ka, i, j, VA * dStb * sum_omega, dN3_Tij[0], + dN3_Tij[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA * dStb * sum_omega, dN3_Tij[1], + dN3_Tij[2], NconjtmpJ); + } + assert(fij[0] == 0); + assert(fij[1] == 0); + assert(fij[2] == 0); + fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * + fijc[1] + delz * delx * fijc[2]) / rsq); + fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * + fijc[1] + delz * dely * fijc[2]) / rsq); + fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * + fijc[1] + delz * delz * fijc[2]) / rsq); + } + return Stb; +} + +/* + * Scalar reference implementation of neighbor routine. 
+ */ +template +void ref_rebo_neigh(KernelArgsAIREBOT * ka) { + int offset = ka->neigh_from_atom * ka->num_neighs_per_atom; + for (int i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) { + ka->neigh_rebo.offset[i] = offset; + int itype = ka->map[ka->x[i].w]; + int n = 0; + ka->nC[i] = 0; + ka->nH[i] = 0; + for (int j = 0; j < ka->neigh_lmp.num[i]; j++) { + int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j]; + flt_t delx = ka->x[i].x - ka->x[ji].x; + flt_t dely = ka->x[i].y - ka->x[ji].y; + flt_t delz = ka->x[i].z - ka->x[ji].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + int jtype = ka->map[ka->x[ji].w]; + if (rsq < ka->params.rcmaxsq[itype][jtype]) { + ka->neigh_rebo.entries[offset + n++] = ji; + flt_t rcmin = ka->params.rcmin[itype][jtype]; + flt_t rcmax = ka->params.rcmax[itype][jtype]; + if (jtype == CARBON) + ka->nC[i] += Sp(overloaded::sqrt(rsq), rcmin, rcmax, NULL); + else + ka->nH[i] += Sp(overloaded::sqrt(rsq), rcmin, rcmax, NULL); + } + } + ka->neigh_rebo.num[i] = n; + offset += n; + } +} + +template +void ref_torsion_single_interaction(KernelArgsAIREBOT * ka, int i, + int j) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * f = ka->result_f; + flt_t (*rcmin)[2] = ka->params.rcmin; + flt_t (*rcmax)[2] = ka->params.rcmax; + flt_t (*epsilonT)[2] = ka->params.epsilonT; + flt_t thmin = ka->params.thmin; + flt_t thmax = ka->params.thmax; + int itype = map[x[i].w]; + flt_t xtmp = x[i].x; + flt_t ytmp = x[i].y; + flt_t ztmp = x[i].z; + int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]]; + int jnum = ka->neigh_rebo.num[i]; + int jtype = map[x[j].w]; + + flt_t del32x = x[j].x-x[i].x; + flt_t del32y = x[j].y-x[i].y; + flt_t del32z = x[j].z-x[i].z; + flt_t rsq = del32x*del32x + del32y*del32y + del32z*del32z; + flt_t r32 = overloaded::sqrt(rsq); + flt_t del23x = -del32x; + flt_t del23y = -del32y; + flt_t del23z = -del32z; + flt_t r23 = r32; + flt_t dw23, w23 = Sp(r23,rcmin[itype][jtype],rcmax[itype][jtype], + &dw23); + + assert(itype == 0); + assert(jtype == 0); + + for (int kk = 0; kk < jnum; kk++) { + int k = REBO_neighs_i[kk]; + int ktype = map[x[k].w]; + if (k == j) continue; + flt_t del21x = x[i].x-x[k].x; + flt_t del21y = x[i].y-x[k].y; + flt_t del21z = x[i].z-x[k].z; + flt_t rsq = del21x*del21x + del21y*del21y + del21z*del21z; + flt_t r21 = overloaded::sqrt(rsq); + flt_t cos321 = - ((del21x*del32x) + (del21y*del32y) + + (del21z*del32z)) / (r21*r32); + cos321 = fmin(cos321,1); + cos321 = fmax(cos321,-1); + flt_t sin321 = overloaded::sqrt(1 - cos321*cos321); + if (sin321 < TOL) continue; + + flt_t deljkx = del21x-del23x; + flt_t deljky = del21y-del23y; + flt_t deljkz = del21z-del23z; + flt_t rjk2 = deljkx*deljkx + deljky*deljky + deljkz*deljkz; + flt_t rjk = overloaded::sqrt(rjk2); + flt_t rik2 = r21*r21; + flt_t dw21, w21 = Sp(r21,rcmin[itype][ktype],rcmax[itype][ktype], + &dw21); + + flt_t rij = r32; + flt_t rik = r21; + flt_t rij2 = r32*r32; + flt_t costmp = static_cast(0.5)*(rij2+rik2-rjk2)/rij/rik; + flt_t dtsjik, tspjik = Sp2(costmp,thmin,thmax,&dtsjik); + dtsjik = -dtsjik; + + int * REBO_neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]]; + int lnum = ka->neigh_rebo.num[j]; + for (int ll = 0; ll < lnum; ll++) { + int l = REBO_neighs_j[ll]; + int ltype = map[x[l].w]; + if (l == i || l == k) continue; + flt_t del34x = x[j].x-x[l].x; + flt_t del34y = x[j].y-x[l].y; + flt_t del34z = x[j].z-x[l].z; + flt_t rsq = del34x*del34x + del34y*del34y + del34z*del34z; + flt_t r34 = overloaded::sqrt(rsq); + flt_t cos234 = 
(del32x*del34x + del32y*del34y + + del32z*del34z) / (r32*r34); + cos234 = fmin(cos234,1); + cos234 = fmax(cos234,-1); + flt_t sin234 = overloaded::sqrt(1 - cos234*cos234); + if (sin234 < TOL) continue; + flt_t dw34, w34 = Sp(r34,rcmin[jtype][ltype],rcmax[jtype][ltype], + &dw34); + flt_t delilx = del23x + del34x; + flt_t delily = del23y + del34y; + flt_t delilz = del23z + del34z; + flt_t ril2 = delilx*delilx + delily*delily + delilz*delilz; + flt_t ril = overloaded::sqrt(ril2); + flt_t rjl2 = r34*r34; + + flt_t rjl = r34; + flt_t costmp = static_cast(0.5)*(rij2+rjl2-ril2)/rij/rjl; + flt_t dtsijl, tspijl = Sp2(costmp,thmin,thmax,&dtsijl); + dtsijl = -dtsijl; //need minus sign + flt_t cross321x = (del32y*del21z)-(del32z*del21y); + flt_t cross321y = (del32z*del21x)-(del32x*del21z); + flt_t cross321z = (del32x*del21y)-(del32y*del21x); + flt_t cross321mag = overloaded::sqrt(cross321x*cross321x+ + cross321y*cross321y + cross321z*cross321z); + flt_t cross234x = (del23y*del34z)-(del23z*del34y); + flt_t cross234y = (del23z*del34x)-(del23x*del34z); + flt_t cross234z = (del23x*del34y)-(del23y*del34x); + flt_t cross234mag = overloaded::sqrt(cross234x*cross234x+ + cross234y*cross234y + cross234z*cross234z); + flt_t cwnum = (cross321x*cross234x) + + (cross321y*cross234y)+(cross321z*cross234z); + flt_t cwnom = r21*r34*r32*r32*sin321*sin234; + flt_t cw = cwnum/cwnom; + + flt_t cw2 = (static_cast(.5)*(1-cw)); + flt_t ekijl = epsilonT[ktype][ltype]; + flt_t Ec = 256*ekijl/405; + flt_t Vtors = (Ec*(overloaded::pow(cw2,5)))-(ekijl/10); + + ka->result_eng += Vtors*w21*w23*w34*(1-tspjik)*(1-tspijl); + + flt_t dndijx = (cross234y*del21z)-(cross234z*del21y); + flt_t dndijy = (cross234z*del21x)-(cross234x*del21z); + flt_t dndijz = (cross234x*del21y)-(cross234y*del21x); + + flt_t tmpvecx = (del34y*cross321z)-(del34z*cross321y); + flt_t tmpvecy = (del34z*cross321x)-(del34x*cross321z); + flt_t tmpvecz = (del34x*cross321y)-(del34y*cross321x); + + dndijx = dndijx+tmpvecx; + dndijy = dndijy+tmpvecy; + dndijz = dndijz+tmpvecz; + + flt_t dndikx = (del23y*cross234z)-(del23z*cross234y); + flt_t dndiky = (del23z*cross234x)-(del23x*cross234z); + flt_t dndikz = (del23x*cross234y)-(del23y*cross234x); + + flt_t dndjlx = (cross321y*del23z)-(cross321z*del23y); + flt_t dndjly = (cross321z*del23x)-(cross321x*del23z); + flt_t dndjlz = (cross321x*del23y)-(cross321y*del23x); + + flt_t dcidij = ((r23*r23)-(r21*r21)+(rjk*rjk))/(2*r23*r23*r21); + flt_t dcidik = ((r21*r21)-(r23*r23)+(rjk*rjk))/(2*r23*r21*r21); + flt_t dcidjk = (-rjk)/(r23*r21); + flt_t dcjdji = ((r23*r23)-(r34*r34)+(ril*ril))/(2*r23*r23*r34); + flt_t dcjdjl = ((r34*r34)-(r23*r23)+(ril*ril))/(2*r23*r34*r34); + flt_t dcjdil = (-ril)/(r23*r34); + + flt_t dsidij = (-cos321/sin321)*dcidij; + flt_t dsidik = (-cos321/sin321)*dcidik; + flt_t dsidjk = (-cos321/sin321)*dcidjk; + + flt_t dsjdji = (-cos234/sin234)*dcjdji; + flt_t dsjdjl = (-cos234/sin234)*dcjdjl; + flt_t dsjdil = (-cos234/sin234)*dcjdil; + + flt_t dxidij = (r21*sin321)+(r23*r21*dsidij); + flt_t dxidik = (r23*sin321)+(r23*r21*dsidik); + flt_t dxidjk = (r23*r21*dsidjk); + + flt_t dxjdji = (r34*sin234)+(r23*r34*dsjdji); + flt_t dxjdjl = (r23*sin234)+(r23*r34*dsjdjl); + flt_t dxjdil = (r23*r34*dsjdil); + + flt_t ddndij = (dxidij*cross234mag)+(cross321mag*dxjdji); + flt_t ddndik = dxidik*cross234mag; + flt_t ddndjk = dxidjk*cross234mag; + flt_t ddndjl = cross321mag*dxjdjl; + flt_t ddndil = cross321mag*dxjdil; + flt_t dcwddn = -cwnum/(cwnom*cwnom); + flt_t dcwdn = 1/cwnom; + flt_t dvpdcw = 
(-1)*Ec*static_cast(-0.5)*5*overloaded::pow(cw2,4)* + w23*w21*w34*(1-tspjik)*(1-tspijl); + + flt_t Ftmpx = dvpdcw*((dcwdn*dndijx)+(dcwddn*ddndij*del23x/r23)); + flt_t Ftmpy = dvpdcw*((dcwdn*dndijy)+(dcwddn*ddndij*del23y/r23)); + flt_t Ftmpz = dvpdcw*((dcwdn*dndijz)+(dcwddn*ddndij*del23z/r23)); + flt_t fix = Ftmpx; + flt_t fiy = Ftmpy; + flt_t fiz = Ftmpz; + flt_t fjx = -Ftmpx; + flt_t fjy = -Ftmpy; + flt_t fjz = -Ftmpz; + + Ftmpx = dvpdcw*((dcwdn*dndikx)+(dcwddn*ddndik*del21x/r21)); + Ftmpy = dvpdcw*((dcwdn*dndiky)+(dcwddn*ddndik*del21y/r21)); + Ftmpz = dvpdcw*((dcwdn*dndikz)+(dcwddn*ddndik*del21z/r21)); + fix += Ftmpx; + fiy += Ftmpy; + fiz += Ftmpz; + flt_t fkx = -Ftmpx; + flt_t fky = -Ftmpy; + flt_t fkz = -Ftmpz; + + Ftmpx = (dvpdcw*dcwddn*ddndjk*deljkx)/rjk; + Ftmpy = (dvpdcw*dcwddn*ddndjk*deljky)/rjk; + Ftmpz = (dvpdcw*dcwddn*ddndjk*deljkz)/rjk; + fjx += Ftmpx; + fjy += Ftmpy; + fjz += Ftmpz; + fkx -= Ftmpx; + fky -= Ftmpy; + fkz -= Ftmpz; + + Ftmpx = dvpdcw*((dcwdn*dndjlx)+(dcwddn*ddndjl*del34x/r34)); + Ftmpy = dvpdcw*((dcwdn*dndjly)+(dcwddn*ddndjl*del34y/r34)); + Ftmpz = dvpdcw*((dcwdn*dndjlz)+(dcwddn*ddndjl*del34z/r34)); + fjx += Ftmpx; + fjy += Ftmpy; + fjz += Ftmpz; + flt_t flx = -Ftmpx; + flt_t fly = -Ftmpy; + flt_t flz = -Ftmpz; + + Ftmpx = (dvpdcw*dcwddn*ddndil*delilx)/ril; + Ftmpy = (dvpdcw*dcwddn*ddndil*delily)/ril; + Ftmpz = (dvpdcw*dcwddn*ddndil*delilz)/ril; + fix += Ftmpx; + fiy += Ftmpy; + fiz += Ftmpz; + flx -= Ftmpx; + fly -= Ftmpy; + flz -= Ftmpz; + + // coordination forces + + flt_t fpair = Vtors*dw21*w23*w34*(1-tspjik)*(1-tspijl) / r21; + fix -= del21x*fpair; + fiy -= del21y*fpair; + fiz -= del21z*fpair; + fkx += del21x*fpair; + fky += del21y*fpair; + fkz += del21z*fpair; + + fpair = Vtors*w21*dw23*w34*(1-tspjik)*(1-tspijl) / r23; + fix -= del23x*fpair; + fiy -= del23y*fpair; + fiz -= del23z*fpair; + fjx += del23x*fpair; + fjy += del23y*fpair; + fjz += del23z*fpair; + + fpair = Vtors*w21*w23*dw34*(1-tspjik)*(1-tspijl) / r34; + fjx -= del34x*fpair; + fjy -= del34y*fpair; + fjz -= del34z*fpair; + flx += del34x*fpair; + fly += del34y*fpair; + flz += del34z*fpair; + + // additional cut off function forces + + flt_t fcpc = -Vtors*w21*w23*w34*dtsjik*(1-tspijl); + fpair = fcpc*dcidij/rij; + fix += fpair*del23x; + fiy += fpair*del23y; + fiz += fpair*del23z; + fjx -= fpair*del23x; + fjy -= fpair*del23y; + fjz -= fpair*del23z; + + fpair = fcpc*dcidik/rik; + fix += fpair*del21x; + fiy += fpair*del21y; + fiz += fpair*del21z; + fkx -= fpair*del21x; + fky -= fpair*del21y; + fkz -= fpair*del21z; + + fpair = fcpc*dcidjk/rjk; + fjx += fpair*deljkx; + fjy += fpair*deljky; + fjz += fpair*deljkz; + fkx -= fpair*deljkx; + fky -= fpair*deljky; + fkz -= fpair*deljkz; + + fcpc = -Vtors*w21*w23*w34*(1-tspjik)*dtsijl; + fpair = fcpc*dcjdji/rij; + fix += fpair*del23x; + fiy += fpair*del23y; + fiz += fpair*del23z; + fjx -= fpair*del23x; + fjy -= fpair*del23y; + fjz -= fpair*del23z; + + fpair = fcpc*dcjdjl/rjl; + fjx += fpair*del34x; + fjy += fpair*del34y; + fjz += fpair*del34z; + flx -= fpair*del34x; + fly -= fpair*del34y; + flz -= fpair*del34z; + + fpair = fcpc*dcjdil/ril; + fix += fpair*delilx; + fiy += fpair*delily; + fiz += fpair*delilz; + flx -= fpair*delilx; + fly -= fpair*delily; + flz -= fpair*delilz; + + // sum per-atom forces into atom force array + + f[i].x += fix; f[i].y += fiy; f[i].z += fiz; + f[j].x += fjx; f[j].y += fjy; f[j].z += fjz; + f[k].x += fkx; f[k].y += fky; f[k].z += fkz; + f[l].x += flx; f[l].y += fly; f[l].z += flz; + } + } +} + +template +void 
ref_torsion(KernelArgsAIREBOT * ka) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + int * tag = ka->tag; + for (int ii = ka->frebo_from_atom; ii < ka->frebo_to_atom; ii++) { + int i = ii; + int itag = tag[i]; + int itype = map[x[i].w]; + if (itype != 0) continue; + flt_t xtmp = x[i].x; + flt_t ytmp = x[i].y; + flt_t ztmp = x[i].z; + int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]]; + int jnum = ka->neigh_rebo.num[i]; + for (int jj = 0; jj < jnum; jj++) { + int j = REBO_neighs_i[jj]; + int jtag = tag[j]; + + if (itag > jtag) { + if (((itag+jtag) & 1) == 0) continue; + } else if (itag < jtag) { + if (((itag+jtag) & 1) == 1) continue; + } else { + if (x[j].z < ztmp) continue; + if (x[j].z == ztmp && x[j].y < ytmp) continue; + if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp) continue; + } + + int jtype = map[x[j].w]; + if (jtype != 0) continue; + ref_torsion_single_interaction(ka, i, j); + } + } +} + +/* + * Calculate single REBO interaction. + * Corresponds to FREBO method. Note that the bondorder() function is + * inlined. + */ +template +void ref_frebo_single_interaction(KernelArgsAIREBOT * ka, int i, + int j) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + int jj; + int itype = map[x[i].w]; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int jtype = map[x[j].w]; + flt_t delx = x[i].x - x[j].x; + flt_t dely = x[i].y - x[j].y; + flt_t delz = x[i].z - x[j].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + flt_t rij = overloaded::sqrt(rsq); + flt_t rcminij = ka->params.rcmin[itype][jtype]; + flt_t rcmaxij = ka->params.rcmax[itype][jtype]; + flt_t dwij; + flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij); + if (wij <= TOL) return; + + flt_t Qij = ka->params.Q[itype][jtype]; + flt_t Aij = ka->params.A[itype][jtype]; + flt_t alphaij = ka->params.alpha[itype][jtype]; + + flt_t exp_alphar = exp(-alphaij * rij); + flt_t VR_by_wij = (1.0 + (Qij / rij)) * Aij * exp_alphar; + flt_t VR = wij * VR_by_wij; + flt_t pre = wij * Aij * exp_alphar; + flt_t dVRdi = pre * ((-alphaij) - (Qij / rsq) - (Qij * alphaij / rij)); + dVRdi += VR_by_wij * dwij; + + flt_t VA_by_wij = 0, dVA = 0; + for (int k = 0; k < 3; k++) { + flt_t BIJc = ka->params.BIJc[itype][jtype][k]; + flt_t Betaij = ka->params.Beta[itype][jtype][k]; + flt_t term = -BIJc * overloaded::exp(-Betaij * rij); + VA_by_wij += term; + dVA += -Betaij * wij * term; + } + dVA += VA_by_wij * dwij; + flt_t VA = VA_by_wij * wij; + + acc_t fij[3] = {0}; + flt_t Nij = ka->nH[i] + ka->nC[i] - wij; + flt_t Nji = ka->nH[j] + ka->nC[j] - wij; + flt_t NconjtmpI; + flt_t pij = frebo_pij(ka, i, j, delx, dely, delz, rij, wij, VA, &NconjtmpI, + fij); + flt_t NconjtmpJ; + acc_t fji[3] = {0}; + flt_t pji = frebo_pij(ka, j, i, -delx, -dely, -delz, rij, wij, VA, + &NconjtmpJ, fji); + fij[0] -= fji[0]; fij[1] -= fji[1]; fij[2] -= fji[2]; + flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ); + flt_t dN3[3]; + flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + frebo_N_spline_force(ka, i, j, VA, dN3[0], dN3[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA, dN3[1], dN3[2], NconjtmpJ); + flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + flt_t sum_omega = 0.0; + if (fabs(Tij) > TOL) { + sum_omega = frebo_sum_omega(ka, i, j, delx, dely, delz, rij, VA * Tij, fij); + frebo_N_spline_force(ka, i, j, VA * sum_omega, dN3[0], dN3[2], NconjtmpI); + frebo_N_spline_force(ka, j, i, VA * sum_omega, dN3[1], dN3[2], NconjtmpJ); + } 
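+
+  // Added commentary (editor's note, not in the original patch): at this
+  // point the REBO bond order is assembled as
+  //   bij = 0.5 * (pij + pji) + pi_rc + Tij * sum_omega
+  // and enters the pair energy in the Brenner form E = VR + bij * VA,
+  // which is why only the attractive contribution dVA is scaled by bij
+  // below, while fij carries the angular and dihedral derivatives.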
+ flt_t pi_dh = Tij * sum_omega; + flt_t bij = static_cast(0.5) * (pij + pji) + pi_rc + pi_dh; + flt_t dVAdi = bij * dVA; + flt_t fpair = -(dVRdi + dVAdi) / rij; + + result_f[i].x += fpair * delx + fij[0]; + result_f[i].y += fpair * dely + fij[1]; + result_f[i].z += fpair * delz + fij[2]; + result_f[j].x -= fpair * delx + fij[0]; + result_f[j].y -= fpair * dely + fij[1]; + result_f[j].z -= fpair * delz + fij[2]; + + flt_t evdwl = VR + bij * VA; + ka->result_eng += evdwl; + result_f[i].w += 0.5 * evdwl; + result_f[j].w += 0.5 * evdwl; +} + + +template +inline void ref_frebo_single_atom(KernelArgsAIREBOT * ka, int i) { + AtomAIREBOT * x = ka->x; + int * tag = ka->tag; + int jj; + int itag = tag[i]; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int jnum = ka->neigh_rebo.num[i]; + for (jj = 0; jj < jnum; jj++) { + int j = neighs[jj]; + int jtag = tag[j]; + if (itag > jtag) { + if (((itag + jtag) & 1) == 0) + continue; + } else if (itag < jtag) { + if (((itag + jtag) & 1) == 1) + continue; + } else { + if (x[j].z < z_i) + continue; + if (x[j].z == z_i && x[j].y < y_i) + continue; + if (x[j].z == z_i && x[j].y == y_i && x[j].x < x_i) + continue; + } + ref_frebo_single_interaction(ka, i, j); + } +} + + +template +void ref_frebo(KernelArgsAIREBOT * ka, int torflag) { + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + ref_frebo_single_atom(ka, i); + } + if (torflag) ref_torsion(ka); +} + +template +void ref_lennard_jones_single_interaction(KernelArgsAIREBOT * ka, + int i, int j, int morseflag) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + + int itype = map[x[i].w]; + int jtype = map[x[j].w]; + + flt_t delx = x[i].x - x[j].x; + flt_t dely = x[i].y - x[j].y; + flt_t delz = x[i].z - x[j].z; + flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq >= ka->params.cutljsq[itype][jtype]) { return; } + flt_t rij = overloaded::sqrt(rsq); + + LennardJonesPathAIREBOT testpath; + flt_t cij = 1.0; + if (rij < ka->params.cut3rebo) { + #pragma noinline + cij = ref_lennard_jones_test_path(ka, i, j, rij, + ka->params.rcmax[itype][jtype], &testpath); + } + if (cij == 0) { + return; + } + + flt_t sigcut = ka->params.sigcut; + flt_t sigmin = ka->params.sigmin; + flt_t sigma = ka->params.sigma[itype][jtype]; + flt_t rljmax = sigcut * sigma; + flt_t rljmin = sigmin * sigma; + + flt_t dslw, slw = Sp2(rij, rljmin, rljmax, &dslw); + + flt_t vdw, dvdw; + if (morseflag) { + const flt_t exr = exp(-rij * ka->params.lj4[itype][jtype]); + vdw = ka->params.lj1[itype][jtype] * exr * + (ka->params.lj2[itype][jtype]*exr - 2); + dvdw = ka->params.lj3[itype][jtype] * exr * + (1 - ka->params.lj2[itype][jtype]*exr); + } else { + flt_t r2inv = 1 / rsq; + flt_t r6inv = r2inv * r2inv * r2inv; + + vdw = r6inv * (ka->params.lj3[itype][jtype]*r6inv - + ka->params.lj4[itype][jtype]); + dvdw = -r6inv * (ka->params.lj1[itype][jtype]*r6inv - + ka->params.lj2[itype][jtype]) / rij; + } + + flt_t VLJ = vdw * slw; + flt_t dVLJ = dvdw * slw + vdw * dslw; + + flt_t dStr, Str = Sp2(rij, ka->params.rcLJmin[itype][jtype], + ka->params.rcLJmax[itype][jtype], &dStr); + flt_t VA = Str * cij * VLJ; + flt_t Stb = 0; + acc_t fij[3] = {0}; + if (Str > 0) { + #pragma noinline + Stb = ref_lennard_jones_bondorder(ka, i, j, VA, fij); + } + flt_t fpair = -(dStr * (Stb * cij * VLJ - cij * VLJ) + + dVLJ * (Str * Stb * cij + cij - Str * cij)) / rij; + flt_t evdwl = VA * Stb + (1 - Str) * cij * VLJ; + 
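+
+  // Added commentary (editor's note, not in the original patch): with
+  // VA = Str * cij * VLJ, the energy computed above is the AIREBO
+  // switching form
+  //   E_LJ = Str * Stb * cij * VLJ + (1 - Str) * cij * VLJ,
+  // so the bond-order factor Stb acts only inside the Str switching
+  // window; fpair is -dE_LJ/dr at fixed Stb and cij, whose own
+  // derivatives are applied through fij and the path force update.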
result_f[i].x += fpair * delx + fij[0]; + result_f[i].y += fpair * dely + fij[1]; + result_f[i].z += fpair * delz + fij[2]; + result_f[j].x -= fpair * delx + fij[0]; + result_f[j].y -= fpair * dely + fij[1]; + result_f[j].z -= fpair * delz + fij[2]; + ka->result_eng += evdwl; + + if (cij < 1) { + #pragma noinline + ref_lennard_jones_force_path(ka, Str * Stb * VLJ + (1 - Str) * VLJ, + &testpath); + } +} + +template +void ref_lennard_jones_single_atom(KernelArgsAIREBOT * ka, int i, + int morseflag) { + AtomAIREBOT * x = ka->x; + int * tag = ka->tag; + int jj; + int itag = tag[i]; + int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i]; + int jnum = ka->neigh_lmp.num_half[i]; + for (jj = 0; jj < jnum; jj++) { + int j = neighs[jj]; + ref_lennard_jones_single_interaction(ka, i, j, morseflag); + } +} + +template +void ref_lennard_jones(KernelArgsAIREBOT * ka, int morseflag) { + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + #pragma noinline + ref_lennard_jones_single_atom(ka, i, morseflag); + } +} + +/* ---------------------------------------------------------------------- + Vectorized AIREBO implementation, standalone, using caching to reduce + memory access. + ---------------------------------------------------------------------- */ + +template +struct aut_wrap { + +typedef typename intr_types::fvec fvec; +typedef typename intr_types::avec avec; +typedef typename intr_types::ivec ivec; +typedef typename intr_types::bvec bvec; + +VEC_INLINE inline +static void aut_loadatoms_vec( + AtomAIREBOT * atoms, ivec j_vec, + fvec *x, fvec * y, fvec * z, bvec * type_mask, int * map, ivec map_i, + ivec c_1 +) { + const ivec c_4 = ivec::set1(4); + ivec j_vec_4 = ivec::mullo(c_4, j_vec); + fvec w; + fvec::gather_4_adjacent(j_vec_4, &atoms[0].x, sizeof(flt_t), x, y, z, &w); + ivec jtype = fvec::unpackloepi32(w); + jtype = ivec::srlv(map_i, jtype); //_mm512_castpd_si512(w)); + jtype = ivec::the_and(c_1, jtype); + bvec jtype_mask = ivec::cmpneq(jtype, ivec::setzero()); + *type_mask = jtype_mask; +} + +VEC_INLINE inline +static void aut_loadatoms_vec_notype( + AtomAIREBOT * atoms, ivec j_vec, + fvec *x, fvec * y, fvec * z +) { + const ivec c_4 = ivec::set1(4); + ivec j_vec_4 = ivec::mullo(c_4, j_vec); + fvec::gather_3_adjacent(j_vec_4, &atoms[0].x, sizeof(flt_t), x, y, z); +} + +static fvec aut_Sp2_deriv(fvec r, fvec lo, fvec hi, fvec * d) { + fvec c_1 = fvec::set1(1); + fvec c_2 = fvec::set1(2); + fvec c_3 = fvec::set1(3); + fvec c_6 = fvec::set1(6); + bvec m_lo = fvec::cmple(r, lo); + bvec m_hi = fvec::cmpnlt(r, hi); // nlt == ge + bvec m_tr = bvec::kandn(m_lo, ~ m_hi); + fvec ret = c_1; + ret = fvec::mask_blend(m_hi, ret, fvec::setzero()); + fvec der = fvec::setzero(); + if (bvec::test_any_set(m_tr)) { + fvec diff = hi - lo; + fvec rcp = fvec::recip(diff); + fvec t = (r - lo) * rcp; + fvec v = c_1 - t * t * ( c_3 - c_2 * t); + ret = fvec::mask_blend(m_tr, ret, v); + fvec dv = c_6 * rcp * ( t * t - t); + der = fvec::mask_blend(m_tr, der, dv); + } + *d = der; + return ret; +} + +static fvec aut_Sp_deriv(fvec r, fvec lo, fvec hi, fvec * d) { + fvec c_1 = fvec::set1(1); + fvec c_0_5 = fvec::set1(0.5); + fvec c_m0_5 = fvec::set1(-0.5); + fvec c_PI = fvec::set1(M_PI); + bvec m_lo = fvec::cmple(r, lo); + bvec m_hi = fvec::cmpnlt(r, hi); // nlt == ge + bvec m_tr = bvec::kandn(m_lo, ~ m_hi); + fvec ret = c_1; + ret = fvec::mask_blend(m_hi, ret, fvec::setzero()); + fvec der = fvec::setzero(); + if (bvec::test_any_set(m_tr)) { + fvec diff = hi - lo; + fvec rcp = fvec::mask_recip(c_1, m_tr, 
diff); + fvec t = (r - lo) / diff; + fvec sinval, cosval; + sinval = fvec::mask_sincos(&cosval, fvec::setzero(), c_1, m_tr, c_PI * t); + fvec v = c_0_5 * ( c_1 + cosval); + ret = fvec::mask_blend(m_tr, ret, v); + fvec dv = c_PI * c_m0_5 * rcp * sinval; + der = fvec::mask_blend(m_tr, der, dv); + } + *d = der; + return ret; +} + +static fvec aut_mask_Sp(bvec mask, fvec r, fvec lo, fvec hi) { + fvec c_1 = fvec::set1(1); + fvec c_0_5 = fvec::set1(0.5); + fvec c_PI = fvec::set1(M_PI); + bvec m_lo = fvec::mask_cmple(mask, r, lo); + bvec m_hi = fvec::mask_cmpnlt(mask, r, hi); // nlt == ge + bvec m_tr = bvec::kandn(m_lo, bvec::kandn(m_hi, mask)); + fvec ret = c_1; + ret = fvec::mask_blend(m_hi, ret, fvec::setzero()); + if (bvec::test_any_set(m_tr)) { + fvec rcp = fvec::mask_recip(c_1, m_tr, hi - lo); + fvec t = (r - lo) * rcp; + fvec v = c_0_5 * ( c_1 + fvec::mask_cos(c_1, m_tr, c_PI * t)); + ret = fvec::mask_blend(m_tr, ret, v); + } + return ret; +} + +static void aut_rebo_neigh(KernelArgsAIREBOT * ka) { + int offset = ka->neigh_from_atom * ka->num_neighs_per_atom; + ivec c_CARBON = ivec::setzero(); + int map_i = 0; + int i; + for (i = 1; i < ka->num_types; i++) { + if (ka->map[i]) + map_i |= (1 << i); + } + ivec c_i1 = ivec::set1(1); + ivec c_im = ivec::set1(map_i); + AtomAIREBOT * _noalias x = ka->x; + + for (i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) { + + fvec x_i = fvec::set1(x[i].x); + fvec y_i = fvec::set1(x[i].y); + fvec z_i = fvec::set1(x[i].z); + int itype = ka->map[ka->x[i].w]; + + fvec rcmaxsq0 = fvec::set1(ka->params.rcmaxsq[itype][0]); + fvec rcmaxsq1 = fvec::set1(ka->params.rcmaxsq[itype][1]); + fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); + fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]); + fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]); + fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]); + fvec rcmaxskinsq0 = fvec::set1( + (ka->params.rcmax[itype][0] + ka->skin) * (ka->params.rcmax[itype][0] + + ka->skin)); + fvec rcmaxskinsq1 = fvec::set1( + (ka->params.rcmax[itype][1] + ka->skin) * (ka->params.rcmax[itype][1] + + ka->skin)); + fvec nC = fvec::setzero(); + fvec nH = fvec::setzero(); + + ka->neigh_rebo.offset[i] = offset; + + int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] : + ka->neigh_rebo.num_half[i]; + int * neighs = ka->rebuild_flag ? 
+ &ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] : + &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum]; + int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom]; + int n = 0; + int n_skin = 0; + + int lowest_idx; + #pragma unroll(4) + for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) { + bvec j_mask = bvec::full(); + if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx); + + int * _noalias neighs_l = neighs + lowest_idx; + fvec x_j, y_j, z_j; + bvec jtype_mask; + ivec ji = ivec::maskz_loadu(j_mask, neighs_l); + aut_loadatoms_vec(x, ji, + &x_j, &y_j, &z_j, &jtype_mask, ka->map, c_im, c_i1); + fvec delx = x_i - x_j; + fvec dely = y_i - y_j; + fvec delz = z_i - z_j; + fvec rsq = delx * delx + dely * dely + delz * delz; + if (ka->rebuild_flag) { + fvec rcmaxskinsq = fvec::mask_blend(jtype_mask, rcmaxskinsq0, + rcmaxskinsq1); + bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxskinsq); + ivec::mask_compressstore(c_mask, &skin_target[n_skin], ji); + n_skin += bvec::popcnt(c_mask); + } + fvec rcmaxsq = fvec::mask_blend(jtype_mask, rcmaxsq0, rcmaxsq1); + bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxsq); + if (bvec::test_all_unset(c_mask)) continue; + ivec::mask_compressstore(c_mask, &ka->neigh_rebo.entries[offset + n], ji); + n += bvec::popcnt(c_mask); + fvec rcmax = fvec::mask_blend(jtype_mask, rcmax0, rcmax1); + fvec rcmin = fvec::mask_blend(jtype_mask, rcmin0, rcmin1); + fvec sp = aut_mask_Sp(c_mask, fvec::sqrt(rsq), rcmin, rcmax); + nC = fvec::mask_add(nC, bvec::kandn(jtype_mask, c_mask), nC, sp); + nH = fvec::mask_add(nH, bvec::kand (jtype_mask, c_mask), nH, sp); + } + ka->neigh_rebo.num[i] = n; + if (ka->rebuild_flag) { + for (int i = 0; i < n_skin; i++) { + ka->neigh_rebo.entries[offset+n_skin+i] = skin_target[i]; + } + } + if (ka->rebuild_flag) { + assert(n <= n_skin); + offset += 2 * n_skin; + ka->neigh_rebo.num_half[i] = n_skin; + } else { + assert(n <= jnum); + offset += 2 * jnum; + } + ka->nC[i] = fvec::reduce_add(nC); + ka->nH[i] = fvec::reduce_add(nH); + } +} + + +static fvec aut_eval_poly_lin_pd_2(int n, flt_t * vals, ivec idx, fvec x, + fvec * deriv) { + fvec c_1 = fvec::set1(1); + fvec x_i = c_1; + fvec x_im1 = fvec::setzero(); + fvec result = fvec::setzero(); + fvec i_v = fvec::setzero(); + *deriv = fvec::setzero(); + int i; + for (i = 0; i < n; i++) { + fvec coeff = fvec::gather(idx, vals + i, sizeof(flt_t)); + result = result + coeff * x_i; + *deriv = *deriv + coeff * x_im1 * i_v; + x_im1 = x_i; + x_i = x_i * x; + i_v = i_v + c_1; + } + return result; +} + +static fvec aut_mask_gSpline_pd_2(KernelArgsAIREBOT * ka, + bvec active_mask, int itype, fvec cosjik, + fvec Nij, fvec *dgdc, fvec *dgdN) { + int i; + flt_t * gDom = NULL; + int nDom = 0; + ivec offs = ivec::setzero(); + fvec NCmin = fvec::set1(ka->params.NCmin); + bvec Ngt = fvec::cmpnle(Nij, NCmin); //gt + if (itype == 0) { + nDom = 4; + gDom = &ka->params.gCdom[0]; + offs = ivec::mask_blend(Ngt, offs, ivec::set1(4*6)); + } else { + nDom = 3; + gDom = &ka->params.gHdom[0]; + offs = ivec::set1(8 * 6); + } + cosjik = fvec::max(fvec::set1(gDom[0]), fvec::min(fvec::set1(gDom[nDom]), + cosjik)); + ivec index6 = ivec::setzero(); + for (i = 0; i < nDom; i++) { + bvec cosge = fvec::cmpnlt(cosjik, fvec::set1(gDom[i])); //ge + bvec cosle = fvec::cmple(cosjik, fvec::set1(gDom[i+1])); + index6 = ivec::mask_blend(cosge & cosle, index6, ivec::set1(6*i)); + } + fvec g = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], offs + index6, + cosjik, dgdc); + *dgdN = fvec::setzero(); + 
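+  /* For reference, a scalar sketch (illustrative only, assuming double
+   * precision) of what aut_eval_poly_lin_pd_2 above computes per SIMD
+   * lane: the value and the first derivative of sum_i c[i] * x^i in a
+   * single pass over the coefficients.
+   *
+   *   double poly_and_deriv(int n, const double *c, double x, double *d) {
+   *     double xi = 1.0, xim1 = 0.0, val = 0.0, der = 0.0;
+   *     for (int i = 0; i < n; i++) {
+   *       val += c[i] * xi;         // c[i] * x^i
+   *       der += c[i] * xim1 * i;   // i * c[i] * x^(i-1)
+   *       xim1 = xi;
+   *       xi *= x;
+   *     }
+   *     *d = der;
+   *     return val;
+   *   }
+   */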
if (itype == 0) { + fvec NCmax = fvec::set1(ka->params.NCmax); + bvec Nlt = fvec::cmplt(Nij, NCmax); //gt + bvec Nmask = Ngt & Nlt; + if (bvec::test_any_set(Nmask)) { + fvec dg1; + fvec g1 = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], index6, cosjik, + &dg1); + fvec dS; + fvec cut = aut_Sp_deriv(Nij, NCmin, NCmax, &dS); + *dgdN = fvec::mask_mul(*dgdN, Nmask, dS, g1 - g); + g = fvec::mask_add(g, Nmask, g, cut * ( g1 - g)); + *dgdc = fvec::mask_add(*dgdc, Nmask, *dgdc, cut * ( dg1 - *dgdc)); + } + } + return g; +} + +static fvec aut_PijSpline(KernelArgsAIREBOT * ka, int itype, + int jtype, fvec NijC, fvec NijH, fvec *dN2) { + flt_t ret[fvec::VL] __attribute__((aligned(64))); + flt_t dN20[fvec::VL] __attribute__((aligned(64))); + flt_t dN21[fvec::VL] __attribute__((aligned(64))); + flt_t NijC_[fvec::VL] __attribute__((aligned(64))); + flt_t NijH_[fvec::VL] __attribute__((aligned(64))); + flt_t tmp_dN2[2]; + fvec::store(NijC_, NijC); + fvec::store(NijH_, NijH); + int i; + for (i = 0; i < fvec::VL; i++) { + ret[i] = PijSpline(ka, itype, jtype, NijC_[i], NijH_[i], tmp_dN2); + dN20[i] = tmp_dN2[0]; + dN21[i] = tmp_dN2[1]; + } + dN2[0] = fvec::load(dN20); + dN2[1] = fvec::load(dN21); + return fvec::load(ret); +} + +/* + * aut_frebo_data stores all the short-ranged coordinations + * and intermediate values that get reused frequently during + * bondorder calculations. + * BUF_CAP should rarely exceed 4, so 8 is a very conservative + * value. + */ +static const int BUF_CAP = 8; +struct aut_frebo_data { + fvec rikx_buf[BUF_CAP]; + fvec riky_buf[BUF_CAP]; + fvec rikz_buf[BUF_CAP]; + fvec rikmag_buf[BUF_CAP]; + fvec cosjik_buf[BUF_CAP]; + ivec k_buf[BUF_CAP]; + fvec g_buf[BUF_CAP]; + fvec dgdc_buf[BUF_CAP]; + fvec ex_lam_buf[BUF_CAP]; + fvec wik_buf[BUF_CAP]; + fvec dwik_buf[BUF_CAP]; + fvec cutN_buf[BUF_CAP]; + fvec dcutN_buf[BUF_CAP]; + bvec ktype_buf[BUF_CAP]; + bvec mask_buf[BUF_CAP]; + fvec force_k_x_buf[BUF_CAP]; + fvec force_k_y_buf[BUF_CAP]; + fvec force_k_z_buf[BUF_CAP]; + int buf_len; + fvec x_i; + fvec y_i; + fvec z_i; + fvec x_j; + fvec y_j; + fvec z_j; + fvec nCi; + fvec nHi; + fvec force_i_x; + fvec force_i_y; + fvec force_i_z; + fvec force_j_x; + fvec force_j_y; + fvec force_j_z; +}; + +/* + * Initialize values in aut_frebo_data and perform the calculations + * for p_ij. 
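+ *
+ * In REBO terms, the per-lane quantity computed here is
+ *
+ *   p_ij = [ 1 + sum_k w_ik * g(cos theta_jik) * exp(lambda_jik)
+ *              + P_ij(N_ij^C, N_ij^H) ]^(-1/2)
+ *
+ * which corresponds to the fvec::invsqrt(c_1 + sum_pij + PijS)
+ * expression in the body below.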
+ */ +static fvec aut_frebo_pij_pd_2( + KernelArgsAIREBOT * _noalias ka, + struct aut_frebo_data * _noalias data, + int itype, int jtype, + ivec vi, ivec vj, + fvec rijx, fvec rijy, fvec rijz, fvec rijmag, + fvec wij, fvec VA, fvec * sum_N, fvec fij[3] +) { + AtomAIREBOT * _noalias x = ka->x; + int * _noalias map = ka->map; + flt_t * _noalias nC = ka->nC; + flt_t * _noalias nH = ka->nH; + fvec x_i, y_i, z_i; + fvec x_j, y_j, z_j; + x_i = data->x_i; + y_i = data->y_i; + z_i = data->z_i; + x_j = data->x_j; + y_j = data->y_j; + z_j = data->z_j; + fvec invrijm = fvec::recip(rijmag); + fvec invrijm2 = invrijm * invrijm; + fvec rcminij = fvec::set1(ka->params.rcmin[itype][jtype]); + fvec rcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]); + fvec Nmin = fvec::set1(ka->params.Nmin); + fvec Nmax = fvec::set1(ka->params.Nmax); + int map_i_scalar = 0; + { + int i; + for (i = 1; i < ka->num_types; i++) { + if (ka->map[i]) + map_i_scalar |= (1 << i); + } + } + ivec map_i = ivec::set1(map_i_scalar); + fvec nCi = data->nCi; + fvec nHi = data->nHi; + fvec Nij = nHi + nCi - wij; + fvec factor_jtype, factor_not_jtype; + if (jtype) { + factor_jtype = fvec::set1(1); + factor_not_jtype = fvec::set1(0); + } else { + factor_jtype = fvec::set1(0); + factor_not_jtype = fvec::set1(1); + } + fvec NijC = nCi - wij * factor_not_jtype; + fvec NijH = nHi - wij * factor_jtype; + fvec sum_pij = fvec::setzero(); + fvec sum_dpij_dN = fvec::setzero(); + fvec dN2[2]; + ivec offseti = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, + ka->neigh_rebo.offset, sizeof(int)); + int buf_len = 0; + ivec knum = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, + ka->neigh_rebo.num, sizeof(int)); + ivec kk = ivec::setzero(); + bvec active_mask = ivec::cmplt(kk, knum); + ivec c_i1 = ivec::set1(1); + fvec rho_j = fvec::set1(ka->params.rho[jtype][1]); + fvec rho_k0 = fvec::set1(ka->params.rho[0][1]); + fvec rho_k1 = fvec::set1(ka->params.rho[1][1]); + fvec c_4 = fvec::set1(4); + fvec c_2_0 = fvec::set1(2.0); + fvec c_m2_0 = fvec::set1(-2.0); + fvec c_4_0 = fvec::set1(4.0); + fvec c_0_5 = fvec::set1(0.5); + fvec c_m0_5 = fvec::set1(-0.5); + fvec c_1 = fvec::set1(1); + fvec c_m1 = fvec::set1(-1); + fvec factor_itype = itype ? 
c_1 : fvec::setzero(); + fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); + fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]); + fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]); + fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]); + fvec result_f_i_x = fvec::setzero(); + fvec result_f_i_y = fvec::setzero(); + fvec result_f_i_z = fvec::setzero(); + fvec result_f_j_x = fvec::setzero(); + fvec result_f_j_y = fvec::setzero(); + fvec result_f_j_z = fvec::setzero(); + *sum_N = fvec::setzero(); + { + while (bvec::test_any_set(active_mask)) { + ivec k = ivec::mask_gather(ivec::setzero(), active_mask, kk + offseti, + ka->neigh_rebo.entries, sizeof(int)); + bvec excluded_mask = ivec::cmpeq(k, vj) & active_mask; + if (bvec::test_any_set(excluded_mask)) { + kk = ivec::mask_add(kk, excluded_mask, kk, c_i1); + active_mask = ivec::cmplt(kk, knum); + continue; + } + fvec x_k, y_k, z_k; + bvec ktype_mask; + aut_loadatoms_vec(x, k, &x_k, &y_k, &z_k, &ktype_mask, ka->map, map_i, + c_i1); + fvec rikx = x_i - x_k; + fvec riky = y_i - y_k; + fvec rikz = z_i - z_k; + fvec rikmag = fvec::sqrt(rikx * rikx + riky * riky + rikz * rikz); + fvec rho_k = fvec::mask_blend(ktype_mask, rho_k0, rho_k1); + fvec lamdajik = c_4 * factor_itype * ( rho_k - rikmag - ( rho_j - + rijmag)); + fvec ex_lam = fvec::exp(lamdajik); + fvec rcmax = fvec::mask_blend(ktype_mask, rcmax0, rcmax1); + fvec rcmin = fvec::mask_blend(ktype_mask, rcmin0, rcmin1); + fvec dwik; + fvec wik = aut_Sp_deriv(rikmag, rcmin, rcmax, &dwik); + fvec Nki = fvec::gather(k, nC, sizeof(flt_t)) + + fvec::gather(k, nH, sizeof(flt_t)) - wik; + fvec cosjik = (rijx * rikx + rijy * riky + rijz * rikz) / + ( rijmag * rikmag); + cosjik = fvec::min(c_1, fvec::max(c_m1, cosjik)); + fvec dgdc, dgdN; + fvec g = aut_mask_gSpline_pd_2(ka, active_mask, itype, cosjik, Nij, + &dgdc, &dgdN); + sum_pij = fvec::mask_add(sum_pij, active_mask, sum_pij, wik * g * ex_lam); + sum_dpij_dN = fvec::mask_add(sum_dpij_dN, active_mask, sum_dpij_dN, + wik * ex_lam * dgdN); + fvec dcutN; + fvec cutN = aut_Sp_deriv(Nki, Nmin, Nmax, &dcutN); + *sum_N = fvec::mask_add(*sum_N, active_mask, *sum_N, + fvec::mask_blend(ktype_mask, c_1, + fvec::setzero()) * wik * cutN); + if (buf_len == BUF_CAP) goto exceed_buffer; + data->rikx_buf[buf_len] = rikx; + data->riky_buf[buf_len] = riky; + data->rikz_buf[buf_len] = rikz; + data->rikmag_buf[buf_len] = rikmag; + data->cosjik_buf[buf_len] = cosjik; + data->ktype_buf[buf_len] = ktype_mask; + data->k_buf[buf_len] = k; + data->g_buf[buf_len] = g; + data->dgdc_buf[buf_len] = dgdc; + data->ex_lam_buf[buf_len] = ex_lam; + data->wik_buf[buf_len] = wik; + data->dwik_buf[buf_len] = dwik; + data->mask_buf[buf_len] = active_mask; + data->cutN_buf[buf_len] = cutN; + data->dcutN_buf[buf_len] = dcutN; + buf_len += 1; + kk = ivec::mask_add(kk, active_mask, kk, c_i1); + active_mask = ivec::cmplt(kk, knum); + } + data->buf_len = buf_len; + fvec PijS = aut_PijSpline(ka, itype, jtype, NijC, NijH, &dN2[0]); + fvec pij = fvec::invsqrt(c_1 + sum_pij + PijS); + fvec tmp = c_m0_5 * pij * pij * pij; + int buf_idx; + for (buf_idx = 0; buf_idx < buf_len; buf_idx++) { + fvec rikx = data->rikx_buf[buf_idx]; + fvec riky = data->riky_buf[buf_idx]; + fvec rikz = data->rikz_buf[buf_idx]; + fvec rikmag = data->rikmag_buf[buf_idx]; + fvec cosjik = data->cosjik_buf[buf_idx]; + bvec ktype_mask = data->ktype_buf[buf_idx]; + ivec k = data->k_buf[buf_idx]; + fvec g = data->g_buf[buf_idx]; + fvec dgdc = data->dgdc_buf[buf_idx]; + fvec ex_lam = data->ex_lam_buf[buf_idx]; + fvec wik = 
data->wik_buf[buf_idx]; + fvec dwik = data->dwik_buf[buf_idx]; + bvec mask = data->mask_buf[buf_idx]; + fvec invrikm = fvec::recip(rikmag); + fvec rjkx = rikx - rijx; + fvec rjky = riky - rijy; + fvec rjkz = rikz - rijz; + fvec rjkmag = fvec::sqrt( + rjkx * rjkx + rjky * rjky + rjkz * rjkz); + fvec rijrik = c_2_0 * rijmag * rikmag; + fvec rr = rijmag * rijmag - rikmag * rikmag; + fvec dctdjk = c_m2_0 / rijrik; + fvec dctdik = (rjkmag * rjkmag - rr) / ( rijrik * rikmag * rikmag); + fvec dctdij = (rjkmag * rjkmag + rr) / ( rijrik * rijmag * rijmag); + fvec fi[3], fj[3], fk[3]; + fvec pref = c_0_5 * VA * tmp; + fvec tmp20 = pref * wik * dgdc * ex_lam; + fj[0] = fj[1] = fj[2] = fvec::setzero(); + fvec tmpdik = tmp20 * dctdik; + fi[0] = fvec::setzero() - tmpdik * rikx; + fi[1] = fvec::setzero() - tmpdik * riky; + fi[2] = fvec::setzero() - tmpdik * rikz; + fk[0] = tmpdik * rikx; + fk[1] = tmpdik * riky; + fk[2] = tmpdik * rikz; + + fvec tmpdij = tmp20 * dctdij; + fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmpdij * rijx); + fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmpdij * rijy); + fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmpdij * rijz); + + fvec tmpdjk = tmp20 * dctdjk; + fi[0] = fi[0] - tmpdjk * rjkx; + fi[1] = fi[1] - tmpdjk * rjky; + fi[2] = fi[2] - tmpdjk * rjkz; + fk[0] = fk[0] + tmpdjk * rjkx; + fk[1] = fk[1] + tmpdjk * rjky; + fk[2] = fk[2] + tmpdjk * rjkz; + fij[0] = fvec::mask_add(fij[0], mask, fij[0], tmpdjk * rjkx); + fij[1] = fvec::mask_add(fij[1], mask, fij[1], tmpdjk * rjky); + fij[2] = fvec::mask_add(fij[2], mask, fij[2], tmpdjk * rjkz); + + if (itype) { + fvec tmp21 = pref * wik * g * ex_lam * c_4_0; + fvec tmp21ij = tmp21 * invrijm; + fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmp21ij * rijx); + fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmp21ij * rijy); + fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmp21ij * rijz); + fvec tmp21ik = tmp21 * invrikm; + fi[0] = fi[0] + tmp21ik * rikx; + fi[1] = fi[1] + tmp21ik * riky; + fi[2] = fi[2] + tmp21ik * rikz; + fk[0] = fk[0] - tmp21ik * rikx; + fk[1] = fk[1] - tmp21ik * riky; + fk[2] = fk[2] - tmp21ik * rikz; + } + + // coordination forces + + // dwik forces + fvec tmp22 = pref * dwik * g * ex_lam * invrikm; + fi[0] = fi[0] - tmp22 * rikx; + fi[1] = fi[1] - tmp22 * riky; + fi[2] = fi[2] - tmp22 * rikz; + fk[0] = fk[0] + tmp22 * rikx; + fk[1] = fk[1] + tmp22 * riky; + fk[2] = fk[2] + tmp22 * rikz; + + // PIJ forces + fvec dN2ktype = fvec::mask_blend(ktype_mask, dN2[0], dN2[1]); + fvec tmp23 = pref * dN2ktype * dwik * invrikm; + fi[0] = fi[0] - tmp23 * rikx; + fi[1] = fi[1] - tmp23 * riky; + fi[2] = fi[2] - tmp23 * rikz; + fk[0] = fk[0] + tmp23 * rikx; + fk[1] = fk[1] + tmp23 * riky; + fk[2] = fk[2] + tmp23 * rikz; + + // dgdN forces + fvec tmp24 = pref * sum_dpij_dN * dwik * invrikm; + fi[0] = fi[0] - tmp24 * rikx; + fi[1] = fi[1] - tmp24 * riky; + fi[2] = fi[2] - tmp24 * rikz; + fk[0] = fk[0] + tmp24 * rikx; + fk[1] = fk[1] + tmp24 * riky; + fk[2] = fk[2] + tmp24 * rikz; + + result_f_i_x = fvec::mask_add(result_f_i_x, mask, result_f_i_x, fi[0]); + result_f_i_y = fvec::mask_add(result_f_i_y, mask, result_f_i_y, fi[1]); + result_f_i_z = fvec::mask_add(result_f_i_z, mask, result_f_i_z, fi[2]); + result_f_j_x = fvec::mask_add(result_f_j_x, mask, result_f_j_x, fj[0]); + result_f_j_y = fvec::mask_add(result_f_j_y, mask, result_f_j_y, fj[1]); + result_f_j_z = fvec::mask_add(result_f_j_z, mask, result_f_j_z, fj[2]); + + data->force_k_x_buf[buf_idx] = fk[0]; + data->force_k_y_buf[buf_idx] = fk[1]; + 
data->force_k_z_buf[buf_idx] = fk[2];
+    }
+    data->force_i_x = result_f_i_x;
+    data->force_i_y = result_f_i_y;
+    data->force_i_z = result_f_i_z;
+    data->force_j_x = result_f_j_x;
+    data->force_j_y = result_f_j_y;
+    data->force_j_z = result_f_j_z;
+    return pij;
+  }
+ exceed_buffer:
+  data->buf_len = -1;
+  return fvec::setzero();
+}
+
+/*
+ * Apply the force values stored in aut_frebo_data to
+ * the respective neighbors.
+ */
+static void aut_frebo_data_writeback(
+    KernelArgsAIREBOT * _noalias ka,
+    struct aut_frebo_data * _noalias data) {
+  ResultForceT * _noalias result_f = ka->result_f;
+  flt_t fk_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fk_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fk_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fk_k_buf[ivec::VL] __attribute__((aligned(64)));
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+
+    fvec::store(fk_x_buf, data->force_k_x_buf[buf_idx]);
+    fvec::store(fk_y_buf, data->force_k_y_buf[buf_idx]);
+    fvec::store(fk_z_buf, data->force_k_z_buf[buf_idx]);
+    ivec::store(fk_k_buf, k);
+
+    int lane;
+    for (lane = 0; lane < fvec::VL; lane++) {
+      if (! bvec::test_at(active_mask, lane)) continue;
+      int kk = fk_k_buf[lane];
+      result_f[kk].x += fk_x_buf[lane];
+      result_f[kk].y += fk_y_buf[lane];
+      result_f[kk].z += fk_z_buf[lane];
+    }
+  }
+}
+
+static void aut_frebo_N_spline_force(
+    KernelArgsAIREBOT * _noalias ka,
+    struct aut_frebo_data * _noalias data, int itype, int jtype, ivec vi,
+    ivec vj, fvec VA, fvec dN, fvec dNconj, fvec Nconj) {
+  ivec c_i1 = ivec::set1(1);
+  fvec c_2 = fvec::set1(2);
+  fvec c_TOL = fvec::set1(TOL);
+  ResultForceT * _noalias result_f = ka->result_f;
+  AtomAIREBOT * _noalias x = ka->x;
+  int * _noalias map = ka->map;
+  flt_t * _noalias nC = ka->nC;
+  flt_t * _noalias nH = ka->nH;
+  fvec x_i, y_i, z_i;
+  x_i = data->x_i;
+  y_i = data->y_i;
+  z_i = data->z_i;
+  fvec Nmin = fvec::set1(ka->params.Nmin);
+  fvec Nmax = fvec::set1(ka->params.Nmax);
+  int map_i_scalar = 0;
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  fvec dN2[2];
+  ivec kk = ivec::setzero();
+  fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+  fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+  fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+  fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+  fvec result_f_i_x = fvec::setzero();
+  fvec result_f_i_y = fvec::setzero();
+  fvec result_f_i_z = fvec::setzero();
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+    fvec rikx = data->rikx_buf[buf_idx];
+    fvec riky = data->riky_buf[buf_idx];
+    fvec rikz = data->rikz_buf[buf_idx];
+    fvec rikmag = data->rikmag_buf[buf_idx];
+    bvec ktype_mask = data->ktype_buf[buf_idx];
+
+    fvec dwik = data->dwik_buf[buf_idx];
+    fvec wik = data->wik_buf[buf_idx];
+
+    fvec dNki = data->dcutN_buf[buf_idx];
+    fvec SpN = data->cutN_buf[buf_idx];
+
+    fvec invrikmag = fvec::recip(rikmag);
+    fvec pref = VA * dwik * invrikmag;
+    fvec fdN = dN * pref;
+    fvec fdNconj = pref * SpN * c_2 * dNconj * Nconj;
+    fvec ffactor = fdN;
+    bvec ktype_is_C = ~ ktype_mask;
+    ffactor = fvec::mask_add(ffactor, ktype_is_C, ffactor, fdNconj);
+
+    fvec fkx = ffactor * rikx;
+    fvec fky = ffactor * riky;
+    fvec fkz = ffactor * rikz;
+
+    data->force_k_x_buf[buf_idx] =
data->force_k_x_buf[buf_idx] + fkx; + data->force_k_y_buf[buf_idx] = data->force_k_y_buf[buf_idx] + fky; + data->force_k_z_buf[buf_idx] = data->force_k_z_buf[buf_idx] + fkz; + + result_f_i_x = fvec::mask_sub(result_f_i_x, active_mask, result_f_i_x, fkx); + result_f_i_y = fvec::mask_sub(result_f_i_y, active_mask, result_f_i_y, fky); + result_f_i_z = fvec::mask_sub(result_f_i_z, active_mask, result_f_i_z, fkz); + + bvec need_k_neighs = fvec::mask_cmpnle(active_mask, fvec::abs(dNki), c_TOL) + & ktype_is_C; + if (bvec::test_any_set(need_k_neighs)) { + int lane; + for (lane = 0; lane < fvec::VL; lane++) { + if (! bvec::test_at(need_k_neighs, lane)) continue; + int kk = ivec::at(k, lane); + int k = kk; + int ktype = map[x[k].w]; + int i = ivec::at(vi, lane); + fvec oldVA = VA; + double VA = fvec::at(oldVA, lane); + fvec oldwik = wik; + double wik = fvec::at(oldwik, lane); + fvec olddNconj = dNconj; + double dNconj = fvec::at(olddNconj, lane); + fvec oldNconj = Nconj; + double Nconj = fvec::at(oldNconj, lane); + fvec olddNki = dNki; + double dNki = fvec::at(olddNki, lane); + int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k]; + int nnum = ka->neigh_rebo.num[k]; + int nn; + for (nn = 0; nn < nnum; nn++) { + int n = neighs_k[nn]; + if (n == i) continue; + double rknx = x[k].x - x[n].x; + double rkny = x[k].y - x[n].y; + double rknz = x[k].z - x[n].z; + double rknmag = sqrt(rknx * rknx + rkny * rkny + rknz * rknz); + int ntype = map[x[n].w]; + double rcminkn = ka->params.rcmin[ktype][ntype]; + double rcmaxkn = ka->params.rcmax[ktype][ntype]; + double dwkn; + Sp(rknmag, rcminkn, rcmaxkn, &dwkn); + double ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag; + result_f[k].x -= ffactor * rknx; + result_f[k].y -= ffactor * rkny; + result_f[k].z -= ffactor * rknz; + result_f[n].x += ffactor * rknx; + result_f[n].y += ffactor * rkny; + result_f[n].z += ffactor * rknz; + } + } + } + } + data->force_i_x = data->force_i_x + result_f_i_x; + data->force_i_y = data->force_i_y + result_f_i_y; + data->force_i_z = data->force_i_z + result_f_i_z; +} + +static fvec aut_frebo_pi_rc_pd(KernelArgsAIREBOT * ka, int itype, + int jtype, fvec Nij, fvec Nji, fvec Nijconj, + fvec * dN3) { + flt_t ret[fvec::VL] __attribute__((aligned(64))); + flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64))); + int i; + for (i = 0; i < fvec::VL; i++) { + flt_t dN3tmp[3]; + ret[i] = frebo_pi_rc(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i), + fvec::at(Nijconj, i), &dN3tmp[0]); + dN3ret[0][i] = dN3tmp[0]; + dN3ret[1][i] = dN3tmp[1]; + dN3ret[2][i] = dN3tmp[2]; + } + dN3[0] = fvec::load(&dN3ret[0][0]); + dN3[1] = fvec::load(&dN3ret[1][0]); + dN3[2] = fvec::load(&dN3ret[2][0]); + return fvec::load(&ret[0]); +} + +static fvec aut_frebo_Tij(KernelArgsAIREBOT * ka, int itype, + int jtype, fvec Nij, fvec Nji, fvec Nijconj, + fvec * dN3) { + flt_t ret[fvec::VL] __attribute__((aligned(64))); + flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64))); + int i; + for (i = 0; i < fvec::VL; i++) { + flt_t dN3tmp[3]; + ret[i] = frebo_Tij(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i), + fvec::at(Nijconj, i), &dN3tmp[0]); + dN3ret[0][i] = dN3tmp[0]; + dN3ret[1][i] = dN3tmp[1]; + dN3ret[2][i] = dN3tmp[2]; + } + dN3[0] = fvec::load(&dN3ret[0][0]); + dN3[1] = fvec::load(&dN3ret[1][0]); + dN3[2] = fvec::load(&dN3ret[2][0]); + return fvec::load(&ret[0]); +} + +static fvec aut_frebo_sum_omega( + KernelArgsAIREBOT * _noalias ka, + struct aut_frebo_data * _noalias i_data, + struct aut_frebo_data * _noalias j_data, + int 
itype, int jtype, + ivec vi, ivec vj, + fvec r23x, fvec r23y, fvec r23z, fvec r23mag, + fvec VA, fvec fij[3] +) { + fvec c_1 = fvec::set1(1); + fvec c_m1 = fvec::set1(-1); + fvec c_2 = fvec::set1(2); + fvec c_m2 = fvec::set1(-2); + fvec sum_omega = fvec::setzero(); + fvec thmin = fvec::set1(ka->params.thmin); + fvec thmax = fvec::set1(ka->params.thmax); + // 2 == i, 3 == j + fvec r32x = fvec::setzero() - r23x; + fvec r32y = fvec::setzero() - r23y; + fvec r32z = fvec::setzero() - r23z; + int buf_idx_i, buf_idx_j; + for (buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) { + // a1 == k == buf_idx_i + bvec mask_start = i_data->mask_buf[buf_idx_i]; + fvec r21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21mag = i_data->rikmag_buf[buf_idx_i]; + // TODO use buffered cosjik + fvec cos321 = ( + r23x * r21x + r23y * r21y + r23z * r21z) / ( r23mag * r21mag); + cos321 = fvec::min(c_1, fvec::max(c_m1, cos321)); + fvec sin321 = fvec::sqrt(c_1 - cos321 * cos321); + bvec mask_outer = fvec::cmpneq(fvec::setzero(), sin321) & mask_start; + // add "continue" + fvec sink2i = fvec::mask_recip(fvec::undefined(), mask_outer, + sin321 * sin321); + fvec rik2i = fvec::mask_recip(fvec::undefined(), mask_outer, + r21mag * r21mag); + fvec rr = r23mag * r23mag - r21mag * r21mag; + fvec r31x = r21x - r23x; + fvec r31y = r21y - r23y; + fvec r31z = r21z - r23z; + fvec r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z; + fvec rijrik = c_2 * r23mag * r21mag; + fvec r21mag2 = r21mag * r21mag; + fvec dctik = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 - rr, + rijrik * r21mag2); + fvec dctij = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 + rr, + rijrik * r23mag * r23mag); + fvec dctjk = fvec::mask_div(fvec::undefined(), mask_outer, c_m2, rijrik); + fvec dw21 = i_data->dwik_buf[buf_idx_i]; + fvec w21 = i_data->wik_buf[buf_idx_i]; + fvec dtsjik; + fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik); + dtsjik = fvec::setzero() - dtsjik; // todo replace by appropriate xor. + ivec k = i_data->k_buf[buf_idx_i]; + for (buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) { + // check l == k in second loop. 
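+      // Atom numbering in this double loop: 1 == k (neighbor of i),
+      // 2 == i, 3 == j, 4 == l (neighbor of j), i.e. each (k, l) pair
+      // forms a dihedral k-i-j-l. Lanes with l == k are masked out via
+      // mask_inner_0 below rather than skipped with a scalar "continue".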
+ // l == a4 == buf_idx_j + ivec l = j_data->k_buf[buf_idx_j]; + bvec mask_inner_0 = ivec::mask_cmpneq(mask_outer, k, l) & + j_data->mask_buf[buf_idx_j]; + // add "continue" + fvec r34x = j_data->rikx_buf[buf_idx_j]; + fvec r34y = j_data->riky_buf[buf_idx_j]; + fvec r34z = j_data->rikz_buf[buf_idx_j]; + fvec r34mag = j_data->rikmag_buf[buf_idx_j]; + fvec cos234 = fvec::mask_div(fvec::undefined(), mask_inner_0, + r32x * r34x + r32y * r34y + r32z * r34z, + r23mag * r34mag); + cos234 = fvec::min(c_1, fvec::max(c_m1, cos234)); + fvec sin234 = fvec::mask_sqrt(fvec::undefined(), mask_inner_0, + c_1 - cos234 * cos234); + bvec mask_inner_1 = fvec::mask_cmpneq(mask_inner_0, sin234, + fvec::setzero()); + // add "continue" + fvec sinl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, + sin234 * sin234); + fvec rjl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, + r34mag * r34mag); + fvec dw34 = j_data->dwik_buf[buf_idx_j]; + fvec w34 = j_data->wik_buf[buf_idx_j]; + fvec rr = r23mag * r23mag - r34mag * r34mag; + fvec r24x = r23x + r34x; + fvec r24y = r23y + r34y; + fvec r24z = r23z + r34z; + fvec r242 = r24x * r24x + r24y * r24y + r24z * r24z; + fvec rijrjl = c_2 * r23mag * r34mag; + fvec rjl2 = r34mag * r34mag; + fvec dctjl = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 - rr, + rijrjl * rjl2); + fvec dctji = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 + rr, + rijrjl * r23mag * r23mag); + fvec dctil = fvec::mask_div(fvec::undefined(), mask_inner_1, c_m2, + rijrjl); + fvec dtsijl; + fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl); + dtsijl = fvec::setzero() - dtsijl; + fvec prefactor = VA; + + fvec cross321x = r32y * r21z - r32z * r21y; + fvec cross321y = r32z * r21x - r32x * r21z; + fvec cross321z = r32x * r21y - r32y * r21x; + fvec cross234x = r23y * r34z - r23z * r34y; + fvec cross234y = r23z * r34x - r23x * r34z; + fvec cross234z = r23x * r34y - r23y * r34x; + + fvec cwnum = cross321x * cross234x + cross321y * cross234y + cross321z * + cross234z; + fvec cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234; + fvec om1234 = fvec::mask_div(fvec::undefined(), mask_inner_1, cwnum, + cwnom); + fvec cw = om1234; + fvec sum_omega_contrib = (c_1 - om1234 * om1234) * w21 * w34 * + (c_1 - tspjik) * ( c_1 - tspijl); + sum_omega = fvec::mask_add(sum_omega, mask_inner_1, sum_omega, + sum_omega_contrib); + fvec dt1dik = rik2i - dctik * sink2i * cos321; + fvec dt1djk = fvec::setzero() - dctjk * sink2i * cos321; + fvec dt1djl = rjl2i - dctjl * sinl2i * cos234; + fvec dt1dil = fvec::setzero() - dctil * sinl2i * cos234; + fvec dt1dij = fvec::mask_div(fvec::undefined(), mask_inner_1, c_2, + r23mag * r23mag) - + dctij * sink2i * cos321 - dctji * sinl2i * cos234; + + fvec dt2dikx = r23y * cross234z - r23z * cross234y; + fvec dt2diky = r23z * cross234x - r23x * cross234z; + fvec dt2dikz = r23x * cross234y - r23y * cross234x; + + fvec dt2djlx = r23z * cross321y - r23y * cross321z; + fvec dt2djly = r23x * cross321z - r23z * cross321x; + fvec dt2djlz = r23y * cross321x - r23x * cross321y; + + fvec dt2dijx = r21z * cross234y + r34y * cross321z - + ( r34z * cross321y + r21y * cross234z); + fvec dt2dijy = r21x * cross234z + r34z * cross321x - + ( r34x * cross321z + r21z * cross234x); + fvec dt2dijz = r21y * cross234x + r34x * cross321y - + ( r34y * cross321x + r21x * cross234y); + + fvec aa = prefactor * c_2 * fvec::mask_div(fvec::undefined(), + mask_inner_1, cw, cwnom) * + w21 * w34 * (c_1 - tspjik) * ( c_1 - tspijl); + fvec aaa1 = (fvec::setzero() - prefactor) * (c_1 - om1234 * 
om1234) * + (c_1 - tspjik) * (c_1 - tspijl); + fvec aaa2 = (fvec::setzero() - prefactor) * (c_1 - om1234 * om1234) * + w21 * w34; + fvec at2 = aa * cwnum; + + fvec fcijpc = aaa2 * dtsjik * dctij * (c_1 - tspijl) + aaa2 * dtsijl * + dctji * (c_1 - tspjik) - dt1dij * at2; + fvec fcikpc = aaa2 * dtsjik * dctik * (c_1 - tspijl) - dt1dik * at2; + fvec fcjlpc = aaa2 * dtsijl * dctjl * (c_1 - tspjik) - dt1djl * at2; + fvec fcjkpc = aaa2 * dtsjik * dctjk * (c_1 - tspijl) - dt1djk * at2; + fvec fcilpc = aaa2 * dtsijl * dctil * (c_1 - tspjik) - dt1dil * at2; + + fvec F23x = fcijpc * r23x + aa * dt2dijx; + fvec F23y = fcijpc * r23y + aa * dt2dijy; + fvec F23z = fcijpc * r23z + aa * dt2dijz; + + fvec F12x = fcikpc * r21x + aa * dt2dikx; + fvec F12y = fcikpc * r21y + aa * dt2diky; + fvec F12z = fcikpc * r21z + aa * dt2dikz; + + fvec F34x = fcjlpc * r34x + aa * dt2djlx; + fvec F34y = fcjlpc * r34y + aa * dt2djly; + fvec F34z = fcjlpc * r34z + aa * dt2djlz; + + fvec F31x = fcjkpc * r31x; + fvec F31y = fcjkpc * r31y; + fvec F31z = fcjkpc * r31z; + + fvec F24x = fcilpc * r24x; + fvec F24y = fcilpc * r24y; + fvec F24z = fcilpc * r24z; + + fvec f1x = fvec::setzero() - ( F12x + F31x); + fvec f1y = fvec::setzero() - ( F12y + F31y); + fvec f1z = fvec::setzero() - ( F12z + F31z); + fvec f2x = F12x + F31x; + fvec f2y = F12y + F31y; + fvec f2z = F12z + F31z; + fvec f3x = F34x + F24x; + fvec f3y = F34y + F24y; + fvec f3z = F34z + F24z; + fvec f4x = fvec::setzero() - ( F34x + F24x); + fvec f4y = fvec::setzero() - ( F34y + F24y); + fvec f4z = fvec::setzero() - ( F34z + F24z); + + fij[0] = fvec::mask_add(fij[0], mask_inner_1, fij[0], + F23x + F24x - F31x); + fij[1] = fvec::mask_add(fij[1], mask_inner_1, fij[1], + F23y + F24y - F31y); + fij[2] = fvec::mask_add(fij[2], mask_inner_1, fij[2], + F23z + F24z - F31z); + + fvec tmp20 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * + (c_1 - tspijl) * dw21 * w34 * fvec::mask_recip(fvec::undefined(), + mask_inner_1, r21mag); + f2x = f2x - tmp20 * r21x; + f2y = f2y - tmp20 * r21y; + f2z = f2z - tmp20 * r21z; + f1x = f1x + tmp20 * r21x; + f1y = f1y + tmp20 * r21y; + f1z = f1z + tmp20 * r21z; + + fvec tmp21 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * + (c_1 - tspijl) * w21 * dw34 * fvec::mask_recip(fvec::undefined(), + mask_inner_1, r34mag); + f3x = f3x - tmp21 * r34x; + f3y = f3y - tmp21 * r34y; + f3z = f3z - tmp21 * r34z; + f4x = f4x + tmp21 * r34x; + f4y = f4y + tmp21 * r34y; + f4z = f4z + tmp21 * r34z; + + // 1 == buf_idx_i, 2 == i, 3 == j, 4 == buf_idx_j + i_data->force_k_x_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], + mask_inner_1, i_data->force_k_x_buf[buf_idx_i], f1x); + i_data->force_k_y_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_1, + i_data->force_k_y_buf[buf_idx_i], f1y); + i_data->force_k_z_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_1, + i_data->force_k_z_buf[buf_idx_i], f1z); + i_data->force_i_x = + fvec::mask_add(i_data->force_i_x, mask_inner_1, i_data->force_i_x, f2x); + i_data->force_i_y = + fvec::mask_add(i_data->force_i_y, mask_inner_1, i_data->force_i_y, f2y); + i_data->force_i_z = + fvec::mask_add(i_data->force_i_z, mask_inner_1, i_data->force_i_z, f2z); + j_data->force_i_x = + fvec::mask_add(j_data->force_i_x, mask_inner_1, j_data->force_i_x, f3x); + j_data->force_i_y = + fvec::mask_add(j_data->force_i_y, mask_inner_1, j_data->force_i_y, f3y); + j_data->force_i_z = + fvec::mask_add(j_data->force_i_z, mask_inner_1, j_data->force_i_z, f3z); + 
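+      /* Note on the masked wrappers used for these buffer updates: per
+       * lane, fvec::mask_add(src, m, a, b) yields a + b where m is set
+       * and src otherwise (mirroring the masked-intrinsic convention),
+       * so the writes below only touch lanes active in mask_inner_1 and
+       * leave all other accumulator lanes unchanged. */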
j_data->force_k_x_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_1, + j_data->force_k_x_buf[buf_idx_j], f4x); + j_data->force_k_y_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_1, + j_data->force_k_y_buf[buf_idx_j], f4y); + j_data->force_k_z_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_1, + j_data->force_k_z_buf[buf_idx_j], f4z); + } + } + return sum_omega; +} + +static fvec aut_frebo_pi_dh( + KernelArgsAIREBOT * _noalias ka, + struct aut_frebo_data * _noalias i_data, + struct aut_frebo_data * _noalias j_data, + int itype, int jtype, ivec vi, ivec vj, + fvec r23x, fvec r23y, fvec r23z, fvec r23mag, + fvec VA, + fvec Nij, fvec Nji, fvec Nijconj, fvec NconjtmpI, fvec NconjtmpJ, + fvec fij[3] +) { + fvec c_TOL = fvec::set1(TOL); + fvec dN3[3]; + fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3[0]); + bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL); + fvec sum_omega = fvec::setzero(); + if (bvec::test_any_set(TijgtTOLmask)) { + sum_omega = aut_frebo_sum_omega( + ka, i_data, j_data, itype, jtype, vi, vj, + r23x, r23y, r23z, r23mag, VA * Tij, fij); + sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega); + aut_frebo_N_spline_force(ka, i_data, itype, jtype, vi, vj, VA * sum_omega, + dN3[0], dN3[2], NconjtmpI); + aut_frebo_N_spline_force(ka, j_data, jtype, itype, vj, vi, VA * sum_omega, + dN3[1], dN3[2], NconjtmpJ); + } + return Tij * sum_omega; +} + +/* + We can reuse the aut_frebo_data buffers here to do this calculation very + cheaply. +*/ +static void aut_torsion_vec( + KernelArgsAIREBOT * ka, + struct aut_frebo_data * i_data, + struct aut_frebo_data * j_data, + ivec i, ivec j, fvec wij, fvec dwij +) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + flt_t (*epsilonT)[2] = ka->params.epsilonT; + fvec epsilonT00 = fvec::set1(epsilonT[0][0]); + fvec epsilonT01 = fvec::set1(epsilonT[0][1]); + fvec epsilonT10 = fvec::set1(epsilonT[1][0]); + fvec epsilonT11 = fvec::set1(epsilonT[1][1]); + fvec thmin = fvec::set1(ka->params.thmin); + fvec thmax = fvec::set1(ka->params.thmax); + + const fvec c_1_0 = fvec::set1(1.0); + const fvec c_0_5 = fvec::set1(0.5); + const fvec c_0_1 = fvec::set1(0.1); + const fvec c_2_0 = fvec::set1(2.0); + const fvec c_2_5 = fvec::set1(2.5); + const fvec c_256_405 = fvec::set1(256.0/405.0); + + fvec del32x = j_data->x_i - i_data->x_i; + fvec del32y = j_data->y_i - i_data->y_i; + fvec del32z = j_data->z_i - i_data->z_i; + fvec rsq = del32x * del32x + del32y * del32y + del32z * del32z; + fvec r32 = fvec::sqrt(rsq); + fvec del23x = fvec::setzero() - del32x; + fvec del23y = fvec::setzero() - del32y; + fvec del23z = fvec::setzero() - del32z; + fvec r23 = r32; + fvec w23 = wij; + fvec dw23 = dwij; + + for (int buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) { + bvec mask_start = i_data->mask_buf[buf_idx_i]; + fvec del21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec del21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec del21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k + fvec r21 = i_data->rikmag_buf[buf_idx_i]; + fvec cos321 = i_data->cosjik_buf[buf_idx_i]; + fvec sin321 = fvec::sqrt(c_1_0 - cos321 * cos321); + // strictly equivalent to sin321 < TOL + mask_start = fvec::mask_cmpneq(mask_start, fvec::setzero(), sin321); + if (! 
bvec::test_any_set(mask_start)) continue; + + fvec deljkx = del21x - del23x; + fvec deljky = del21y - del23y; + fvec deljkz = del21z - del23z; + fvec rjk2 = deljkx * deljkx + deljky * deljky + deljkz * deljkz; + fvec rjk = fvec::sqrt(rjk2); + fvec rik2 = r21 * r21; + fvec w21 = i_data->wik_buf[buf_idx_i]; + fvec dw21 = i_data->dwik_buf[buf_idx_i]; + + fvec rij = r32; + fvec rik = r21; + fvec rij2 = r32 * r32; + fvec dtsjik; + fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik); + dtsjik = fvec::setzero() - dtsjik; + + bvec ktype_mask = i_data->ktype_buf[buf_idx_i]; + fvec epsilonT0 = fvec::mask_blend(ktype_mask, epsilonT00, epsilonT10); + fvec epsilonT1 = fvec::mask_blend(ktype_mask, epsilonT01, epsilonT11); + + ivec k = i_data->k_buf[buf_idx_i]; + for (int buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) { + ivec l = j_data->k_buf[buf_idx_j]; + bvec mask_inner_0 = ivec::mask_cmpneq(mask_start, k, l) & + j_data->mask_buf[buf_idx_j]; + if (! bvec::test_any_set(mask_inner_0)) continue; + fvec del34x = j_data->rikx_buf[buf_idx_j]; + fvec del34y = j_data->riky_buf[buf_idx_j]; + fvec del34z = j_data->rikz_buf[buf_idx_j]; + fvec r34 = j_data->rikmag_buf[buf_idx_j]; + bvec ltype_mask = j_data->ktype_buf[buf_idx_j]; + fvec cos234 = j_data->cosjik_buf[buf_idx_j]; + fvec sin234 = fvec::sqrt(c_1_0 - cos234 * cos234); + // strictly equivalent to sin234 < TOL + mask_inner_0 = fvec::mask_cmpneq(mask_inner_0, sin234, fvec::setzero()); + if (! bvec::test_any_set(mask_inner_0)) continue; + fvec dw34 = j_data->dwik_buf[buf_idx_j]; + fvec w34 = j_data->wik_buf[buf_idx_j]; + fvec delilx = del23x + del34x; + fvec delily = del23y + del34y; + fvec delilz = del23z + del34z; + fvec ril2 = delilx * delilx + delily * delily + delilz * delilz; + fvec ril = fvec::sqrt(ril2); + fvec rjl2 = r34 * r34; + + fvec rjl = r34; + fvec dtsijl; + fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl); + dtsijl = fvec::setzero() - dtsijl; + fvec cross321x = del32y * del21z - del32z * del21y; + fvec cross321y = del32z * del21x - del32x * del21z; + fvec cross321z = del32x * del21y - del32y * del21x; + fvec cross321mag = fvec::sqrt(cross321x * cross321x + + cross321y * cross321y + + cross321z * cross321z); + fvec cross234x = del23y * del34z - del23z * del34y; + fvec cross234y = del23z * del34x - del23x * del34z; + fvec cross234z = del23x * del34y - del23y * del34x; + fvec cross234mag = fvec::sqrt(cross234x * cross234x + + cross234y * cross234y + + cross234z * cross234z); + fvec cwnum = cross321x * cross234x + cross321y * cross234y + + cross321z * cross234z; + fvec cwnom = r21 * r34 * r32 * r32 * sin321 * sin234; + fvec cw = cwnum / cwnom; + + fvec cw2 = c_0_5 * ( c_1_0 - cw); + fvec ekijl = fvec::mask_blend(ltype_mask, epsilonT0, epsilonT1); + fvec Ec = c_256_405 * ekijl; + fvec cw2_5 = cw2 * cw2 * cw2 * cw2 * cw2; + fvec Vtors = Ec * cw2_5 - ekijl * c_0_1; + + fvec evdwl = Vtors * w21 * w23 * w34 * (c_1_0-tspjik) * (c_1_0-tspijl); + ka->result_eng += fvec::mask_reduce_add(mask_inner_0, evdwl); + + fvec dndijx = cross234y * del21z - cross234z * del21y; + fvec dndijy = cross234z * del21x - cross234x * del21z; + fvec dndijz = cross234x * del21y - cross234y * del21x; + + fvec tmpvecx = del34y * cross321z - del34z * cross321y; + fvec tmpvecy = del34z * cross321x - del34x * cross321z; + fvec tmpvecz = del34x * cross321y - del34y * cross321x; + + dndijx = dndijx + tmpvecx; + dndijy = dndijy + tmpvecy; + dndijz = dndijz + tmpvecz; + + fvec dndikx = del23y * cross234z - del23z * cross234y; + fvec dndiky = del23z * 
cross234x - del23x * cross234z; + fvec dndikz = del23x * cross234y - del23y * cross234x; + + fvec dndjlx = cross321y * del23z - cross321z * del23y; + fvec dndjly = cross321z * del23x - cross321x * del23z; + fvec dndjlz = cross321x * del23y - cross321y * del23x; + + fvec r23sq = r23 * r23; + fvec r21sq = r21 * r21; + fvec r34sq = r34 * r34; + fvec rjksq = rjk * rjk; + fvec rilsq = ril * ril; + fvec dcidij = (r23sq - r21sq + rjksq) / ( c_2_0 * r23sq * r21); + fvec dcidik = (r21sq - r23sq + rjksq) / ( c_2_0 * r21sq * r23); + fvec dcidjk = fvec::setzero() - rjk / ( r23 * r21); + fvec dcjdji = (r23sq - r34sq + rilsq) / ( c_2_0 * r23sq * r34); + fvec dcjdjl = (r34sq - r23sq + rilsq) / ( c_2_0 * r34sq * r23); + fvec dcjdil = fvec::setzero() - ril / ( r23 * r34); + + fvec dsidij = fvec::setzero() - cos321 / sin321 * dcidij; + fvec dsidik = fvec::setzero() - cos321 / sin321 * dcidik; + fvec dsidjk = fvec::setzero() - cos321 / sin321 * dcidjk; + + fvec dsjdji = fvec::setzero() - cos234 / sin234 * dcjdji; + fvec dsjdjl = fvec::setzero() - cos234 / sin234 * dcjdjl; + fvec dsjdil = fvec::setzero() - cos234 / sin234 * dcjdil; + + fvec dxidij = r21 * sin321 + r23 * r21 * dsidij; + fvec dxidik = r23 * sin321 + r23 * r21 * dsidik; + fvec dxidjk = r23 * r21 * dsidjk; + + fvec dxjdji = r34 * sin234 + r23 * r34 * dsjdji; + fvec dxjdjl = r23 * sin234 + r23 * r34 * dsjdjl; + fvec dxjdil = r23 * r34 * dsjdil; + + fvec ddndij = dxidij * cross234mag + cross321mag * dxjdji; + fvec ddndik = dxidik * cross234mag; + fvec ddndjk = dxidjk * cross234mag; + fvec ddndjl = cross321mag * dxjdjl; + fvec ddndil = cross321mag * dxjdil; + fvec dcwddn = fvec::setzero() - cwnum / ( cwnom * cwnom); + fvec dcwdn = fvec::recip(cwnom); + fvec cw2_4 = cw2 * cw2 * cw2 * cw2; + fvec dvpdcw = c_2_5 * Ec * cw2_4 * w23 * w21 * w34 * (c_1_0 - tspjik) * + (c_1_0 - tspijl); + + fvec Ftmpx = dvpdcw * (dcwdn * dndijx + dcwddn * ddndij * del23x / r23); + fvec Ftmpy = dvpdcw * (dcwdn * dndijy + dcwddn * ddndij * del23y / r23); + fvec Ftmpz = dvpdcw * (dcwdn * dndijz + dcwddn * ddndij * del23z / r23); + fvec fix = Ftmpx; + fvec fiy = Ftmpy; + fvec fiz = Ftmpz; + fvec fjx = fvec::setzero() - Ftmpx; + fvec fjy = fvec::setzero() - Ftmpy; + fvec fjz = fvec::setzero() - Ftmpz; + + Ftmpx = dvpdcw * (dcwdn * dndikx + dcwddn * ddndik * del21x / r21); + Ftmpy = dvpdcw * (dcwdn * dndiky + dcwddn * ddndik * del21y / r21); + Ftmpz = dvpdcw * (dcwdn * dndikz + dcwddn * ddndik * del21z / r21); + fix = fix + Ftmpx; + fiy = fiy + Ftmpy; + fiz = fiz + Ftmpz; + fvec fkx = fvec::setzero() - Ftmpx; + fvec fky = fvec::setzero() - Ftmpy; + fvec fkz = fvec::setzero() - Ftmpz; + + Ftmpx = dvpdcw * dcwddn * ddndjk * deljkx / rjk; + Ftmpy = dvpdcw * dcwddn * ddndjk * deljky / rjk; + Ftmpz = dvpdcw * dcwddn * ddndjk * deljkz / rjk; + fjx = fjx + Ftmpx; + fjy = fjy + Ftmpy; + fjz = fjz + Ftmpz; + fkx = fkx - Ftmpx; + fky = fky - Ftmpy; + fkz = fkz - Ftmpz; + + Ftmpx = dvpdcw * (dcwdn * dndjlx + dcwddn * ddndjl * del34x / r34); + Ftmpy = dvpdcw * (dcwdn * dndjly + dcwddn * ddndjl * del34y / r34); + Ftmpz = dvpdcw * (dcwdn * dndjlz + dcwddn * ddndjl * del34z / r34); + fjx = fjx + Ftmpx; + fjy = fjy + Ftmpy; + fjz = fjz + Ftmpz; + fvec flx = fvec::setzero() - Ftmpx; + fvec fly = fvec::setzero() - Ftmpy; + fvec flz = fvec::setzero() - Ftmpz; + + Ftmpx = dvpdcw * dcwddn * ddndil * delilx / ril; + Ftmpy = dvpdcw * dcwddn * ddndil * delily / ril; + Ftmpz = dvpdcw * dcwddn * ddndil * delilz / ril; + fix = fix + Ftmpx; + fiy = fiy + Ftmpy; + fiz = fiz + Ftmpz; + flx = flx - Ftmpx; + 
fly = fly - Ftmpy; + flz = flz - Ftmpz; + + // coordination forces + + fvec fpair = Vtors * dw21 * w23 * w34 * (c_1_0 - tspjik) * + (c_1_0 - tspijl) / r21; + fix = fix - del21x * fpair; + fiy = fiy - del21y * fpair; + fiz = fiz - del21z * fpair; + fkx = fkx + del21x * fpair; + fky = fky + del21y * fpair; + fkz = fkz + del21z * fpair; + + fpair = Vtors * w21 * dw23 * w34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) / + r23; + fix = fix - del23x * fpair; + fiy = fiy - del23y * fpair; + fiz = fiz - del23z * fpair; + fjx = fjx + del23x * fpair; + fjy = fjy + del23y * fpair; + fjz = fjz + del23z * fpair; + + fpair = Vtors * w21 * w23 * dw34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) / + r34; + fjx = fjx - del34x * fpair; + fjy = fjy - del34y * fpair; + fjz = fjz - del34z * fpair; + flx = flx + del34x * fpair; + fly = fly + del34y * fpair; + flz = flz + del34z * fpair; + + // additional cut off function forces + + fvec fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * dtsjik * (c_1_0 - + tspijl); + fpair = fcpc * dcidij / rij; + fix = fix + fpair * del23x; + fiy = fiy + fpair * del23y; + fiz = fiz + fpair * del23z; + fjx = fjx - fpair * del23x; + fjy = fjy - fpair * del23y; + fjz = fjz - fpair * del23z; + + fpair = fcpc * dcidik / rik; + fix = fix + fpair * del21x; + fiy = fiy + fpair * del21y; + fiz = fiz + fpair * del21z; + fkx = fkx - fpair * del21x; + fky = fky - fpair * del21y; + fkz = fkz - fpair * del21z; + + fpair = fcpc * dcidjk / rjk; + fjx = fjx + fpair * deljkx; + fjy = fjy + fpair * deljky; + fjz = fjz + fpair * deljkz; + fkx = fkx - fpair * deljkx; + fky = fky - fpair * deljky; + fkz = fkz - fpair * deljkz; + + fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * (c_1_0 - tspjik) * + dtsijl; + fpair = fcpc * dcjdji / rij; + fix = fix + fpair * del23x; + fiy = fiy + fpair * del23y; + fiz = fiz + fpair * del23z; + fjx = fjx - fpair * del23x; + fjy = fjy - fpair * del23y; + fjz = fjz - fpair * del23z; + + fpair = fcpc * dcjdjl / rjl; + fjx = fjx + fpair * del34x; + fjy = fjy + fpair * del34y; + fjz = fjz + fpair * del34z; + flx = flx - fpair * del34x; + fly = fly - fpair * del34y; + flz = flz - fpair * del34z; + + fpair = fcpc * dcjdil / ril; + fix = fix + fpair * delilx; + fiy = fiy + fpair * delily; + fiz = fiz + fpair * delilz; + flx = flx - fpair * delilx; + fly = fly - fpair * delily; + flz = flz - fpair * delilz; + + // sum per-atom forces into atom force array + + i_data->force_i_x = fvec::mask_add(i_data->force_i_x, mask_inner_0, + i_data->force_i_x, fix); + i_data->force_i_y = fvec::mask_add(i_data->force_i_y, mask_inner_0, + i_data->force_i_y, fiy); + i_data->force_i_z = fvec::mask_add(i_data->force_i_z, mask_inner_0, + i_data->force_i_z, fiz); + i_data->force_j_x = fvec::mask_add(i_data->force_j_x, mask_inner_0, + i_data->force_j_x, fjx); + i_data->force_j_y = fvec::mask_add(i_data->force_j_y, mask_inner_0, + i_data->force_j_y, fjy); + i_data->force_j_z = fvec::mask_add(i_data->force_j_z, mask_inner_0, + i_data->force_j_z, fjz); + i_data->force_k_x_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], mask_inner_0, + i_data->force_k_x_buf[buf_idx_i], fkx); + i_data->force_k_y_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_0, + i_data->force_k_y_buf[buf_idx_i], fky); + i_data->force_k_z_buf[buf_idx_i] = + fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_0, + i_data->force_k_z_buf[buf_idx_i], fkz); + j_data->force_k_x_buf[buf_idx_j] = + fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_0, + 
j_data->force_k_x_buf[buf_idx_j], flx);
+      j_data->force_k_y_buf[buf_idx_j] =
+        fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_0,
+                       j_data->force_k_y_buf[buf_idx_j], fly);
+      j_data->force_k_z_buf[buf_idx_j] =
+        fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_0,
+                       j_data->force_k_z_buf[buf_idx_j], flz);
+    }
+  }
+}
+
+/*
+ * Processes VL elements of the same type itype/jtype for REBO and TORSION
+ * interactions. This allows us to reuse the aut_frebo_data buffers in the
+ * torsion calculation.
+ */
+static void aut_frebo_batch_of_kind(KernelArgsAIREBOT * ka,
+    int torflag, int itype, int jtype,
+    int * i_buf, int * j_buf) {
+  { // jump-scope for exceed_limits
+    AtomAIREBOT * x = ka->x;
+    int * tag = ka->tag;
+    int * map = ka->map;
+    ResultForceT * result_f = ka->result_f;
+    flt_t rcminij = ka->params.rcmin[itype][jtype];
+    flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+    flt_t Qij = ka->params.Q[itype][jtype];
+    flt_t Aij = ka->params.A[itype][jtype];
+    flt_t alphaij = ka->params.alpha[itype][jtype];
+    fvec vrcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
+    fvec vrcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
+    fvec vQij = fvec::set1(ka->params.Q[itype][jtype]);
+    fvec vAij = fvec::set1(ka->params.A[itype][jtype]);
+    fvec malphaij = fvec::set1(-ka->params.alpha[itype][jtype]);
+    fvec c_1_0 = fvec::set1(1);
+    fvec c_0_5 = fvec::set1(0.5);
+    fvec c_TOL = fvec::set1(1e-9);
+    struct aut_frebo_data i_data, j_data;
+
+    fvec evdwl_vacc = fvec::setzero();
+    ivec vi = ivec::maskz_loadu(bvec::full(), i_buf);
+    int tmp;
+    ivec vj = ivec::maskz_loadu(bvec::full(), j_buf);
+    fvec x_i, y_i, z_i;
+    fvec x_j, y_j, z_j;
+    aut_loadatoms_vec_notype(x, vi, &x_i, &y_i, &z_i);
+    aut_loadatoms_vec_notype(x, vj, &x_j, &y_j, &z_j);
+    i_data.x_i = x_i;
+    i_data.y_i = y_i;
+    i_data.z_i = z_i;
+    i_data.x_j = x_j;
+    i_data.y_j = y_j;
+    i_data.z_j = z_j;
+    j_data.x_i = x_j;
+    j_data.y_i = y_j;
+    j_data.z_i = z_j;
+    j_data.x_j = x_i;
+    j_data.y_j = y_i;
+    j_data.z_j = z_i;
+    fvec delx = x_i - x_j;
+    fvec dely = y_i - y_j;
+    fvec delz = z_i - z_j;
+    fvec rsq = delx * delx + dely * dely + delz * delz;
+    fvec rij = fvec::sqrt(rsq);
+    fvec dwij;
+    fvec wij = aut_Sp_deriv(rij, vrcminij, vrcmaxij, &dwij);
+
+    fvec exp_alphar = fvec::exp(malphaij * rij);
+    fvec Qij_over_rij = vQij / rij;
+    fvec Qij_over_rsq = vQij / rsq;
+    fvec VR_by_wij = ( c_1_0 + Qij_over_rij) * vAij * exp_alphar;
+    fvec VR = wij * VR_by_wij;
+    fvec pre = wij * vAij * exp_alphar;
+    fvec dVRdi = pre * ( malphaij + malphaij * Qij_over_rij - Qij_over_rsq);
+    dVRdi = dVRdi + VR_by_wij * dwij;
+
+    fvec VA_by_wij = fvec::setzero();
+    fvec dVA = fvec::setzero();
+
+    int k;
+    for (k = 0; k < 3; k++) {
+      fvec mBIJc = fvec::set1(-ka->params.BIJc[itype][jtype][k]);
+      fvec mBetaij = fvec::set1(-ka->params.Beta[itype][jtype][k]);
+      fvec term = mBIJc * fvec::exp(mBetaij * rij);
+      VA_by_wij = VA_by_wij + term;
+      dVA = dVA + mBetaij * wij * term;
+    }
+
+    dVA = dVA + dwij * VA_by_wij;
+    fvec VA = wij * VA_by_wij;
+
+    bvec tol_check = fvec::cmplt(wij, c_TOL);
+    VA = fvec::mask_blend(tol_check, VA, fvec::setzero());
+    dVA = fvec::mask_blend(tol_check, dVA, fvec::setzero());
+    VR = fvec::mask_blend(tol_check, VR, fvec::setzero());
+    dVRdi = fvec::mask_blend(tol_check, dVRdi, fvec::setzero());
+
+    fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
+    fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
+    fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
+    fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
+    fvec Nij = (nHi + nCi) - wij;
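+    /* Coordination numbers with the i-j bond itself excluded:
+     *   N_ij = nC_i + nH_i - w_ij   (above)
+     *   N_ji = nC_j + nH_j - w_ij   (below)
+     * i.e. each atom's neighbor count omits the current partner. */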
fvec Nji = (nHj + nCj) - wij; + i_data.nHi = nHi; + i_data.nCi = nCi; + j_data.nHi = nHj; + j_data.nCi = nCj; + fvec fij[3], fji[3]; + fij[0] = fvec::setzero(); fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + fji[0] = fvec::setzero(); fji[1] = fvec::setzero(); + fji[2] = fvec::setzero(); + + fvec NconjtmpI; + fvec pij = aut_frebo_pij_pd_2( + ka, &i_data, itype, jtype, vi, vj, + delx, dely, delz, rij, wij, VA, &NconjtmpI, fij); + + if (i_data.buf_len < 0) goto exceed_limits; + + fvec NconjtmpJ; + fvec rjix = fvec::setzero() - delx; + fvec rjiy = fvec::setzero() - dely; + fvec rjiz = fvec::setzero() - delz; + fvec pji = aut_frebo_pij_pd_2( + ka, &j_data, jtype, itype, vj, vi, + rjix, rjiy, rjiz, rij, wij, VA, &NconjtmpJ, fji); + fij[0] = fij[0] - fji[0]; + fij[1] = fij[1] - fji[1]; + fij[2] = fij[2] - fji[2]; + + if (j_data.buf_len < 0) goto exceed_limits; + + if (torflag && itype == 0 && jtype == 0) + aut_torsion_vec(ka, &i_data, &j_data, vi, vj, wij, dwij); + + fvec Nijconj = c_1_0 + NconjtmpI * NconjtmpI + NconjtmpJ * NconjtmpJ; + fvec dN3[3]; + fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, VA, dN3[0], + dN3[2], NconjtmpI); + aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, VA, dN3[1], + dN3[2], NconjtmpJ); + fvec pi_dh = aut_frebo_pi_dh(ka, &i_data, &j_data, itype, jtype, vi, vj, + delx, dely, delz, rij, VA, Nij, Nji, Nijconj, + NconjtmpI, NconjtmpJ, fij); + + fvec bij = c_0_5 * ( pij + pji) + pi_rc + pi_dh; + fvec dVAdi = bij * dVA; + fvec fpair = (dVAdi + dVRdi) * fvec::recip(rij); + fvec result_f_j_x = fpair * delx - fij[0]; + fvec result_f_j_y = fpair * dely - fij[1]; + fvec result_f_j_z = fpair * delz - fij[2]; + fvec result_f_i_x = fvec::setzero() - result_f_j_x; + fvec result_f_i_y = fvec::setzero() - result_f_j_y; + fvec result_f_i_z = fvec::setzero() - result_f_j_z; + fvec evdwl = VR + bij * VA; + evdwl_vacc = evdwl_vacc + evdwl; + + aut_frebo_data_writeback(ka, &i_data); + aut_frebo_data_writeback(ka, &j_data); + + flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64))); + int fi_i_buf[ivec::VL] __attribute__((aligned(64))); + flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64))); + flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64))); + int fj_j_buf[ivec::VL] __attribute__((aligned(64))); + flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64))); + + result_f_i_x = i_data.force_i_x + result_f_i_x; + result_f_i_y = i_data.force_i_y + result_f_i_y; + result_f_i_z = i_data.force_i_z + result_f_i_z; + result_f_j_x = i_data.force_j_x + result_f_j_x; + result_f_j_y = i_data.force_j_y + result_f_j_y; + result_f_j_z = i_data.force_j_z + result_f_j_z; + + result_f_i_x = j_data.force_j_x + result_f_i_x; + result_f_i_y = j_data.force_j_y + result_f_i_y; + result_f_i_z = j_data.force_j_z + result_f_i_z; + result_f_j_x = j_data.force_i_x + result_f_j_x; + result_f_j_y = j_data.force_i_y + result_f_j_y; + result_f_j_z = j_data.force_i_z + result_f_j_z; + + fvec::store(fi_x_buf, result_f_i_x); + fvec::store(fi_y_buf, result_f_i_y); + fvec::store(fi_z_buf, result_f_i_z); + ivec::store(fi_i_buf, vi); + fvec::store(fj_x_buf, result_f_j_x); + fvec::store(fj_y_buf, result_f_j_y); + fvec::store(fj_z_buf, result_f_j_z); + ivec::store(fj_j_buf, vj); + fvec::store(evdwl_buf, evdwl); + + int lane; + for (lane = 0; lane < 
fvec::VL; lane++) { + int ii = fi_i_buf[lane]; + result_f[ii].x += fi_x_buf[lane]; + result_f[ii].y += fi_y_buf[lane]; + result_f[ii].z += fi_z_buf[lane]; + result_f[ii].w += 0.5 * evdwl_buf[lane]; + int jj = fj_j_buf[lane]; + result_f[jj].x += fj_x_buf[lane]; + result_f[jj].y += fj_y_buf[lane]; + result_f[jj].z += fj_z_buf[lane]; + result_f[jj].w += 0.5 * evdwl_buf[lane]; + } + ka->result_eng += fvec::reduce_add(evdwl_vacc); + return; + } +exceed_limits: + for (int l = 0; l < fvec::VL; l++) { + int i = i_buf[l]; + int j = j_buf[l]; + ref_frebo_single_interaction(ka, i, j); + if (torflag && itype == 0 && jtype == 0) + ref_torsion_single_interaction(ka, i, j); + } +} + +/* + Orders the interactions by itype and jtype and passes chunks to the above + method. +*/ +static void aut_frebo(KernelArgsAIREBOT * ka, int torflag) { + AtomAIREBOT * _noalias x = ka->x; + int * _noalias tag = ka->tag; + int * _noalias map = ka->map; + int i_buf[2][2][fvec::VL]; + int j_buf[2][2][fvec::VL]; + int n_buf[2][2] = {0}; + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + int itag = tag[i]; + int itype = map[x[i].w]; + flt_t x_i = x[i].x; + flt_t y_i = x[i].y; + flt_t z_i = x[i].z; + int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i]; + int jnum = ka->neigh_rebo.num[i]; + for (int jj = 0; jj < jnum; jj++) { + int j = neighs[jj]; + int jtag = tag[j]; + if (itag > jtag) { + if (((itag + jtag) & 1) == 0) + continue; + } else if (itag < jtag) { + if (((itag + jtag) & 1) == 1) + continue; + } else { + if (x[j].z < z_i) + continue; + if (x[j].z == z_i && x[j].y < y_i) + continue; + if (x[j].z == z_i && x[j].y == y_i && x[j].x < x_i) + continue; + } + int jtype = map[x[j].w]; + int ins = n_buf[itype][jtype]; + i_buf[itype][jtype][ins] = i; + j_buf[itype][jtype][ins] = j; + n_buf[itype][jtype] += 1; + if (n_buf[itype][jtype] == fvec::VL) { + aut_frebo_batch_of_kind(ka, torflag, itype, jtype, + i_buf[itype][jtype], j_buf[itype][jtype]); + n_buf[itype][jtype] = 0; + } + } + } + for (int itype = 0; itype < 2; itype++) { + for (int jtype = 0; jtype < 2; jtype++) { + for (int l = 0; l < n_buf[itype][jtype]; l++) { + int i = i_buf[itype][jtype][l]; + int j = j_buf[itype][jtype][l]; + ref_frebo_single_interaction(ka, i, j); + if (torflag && itype == 0 && jtype == 0) + ref_torsion_single_interaction(ka, i, j); + } + } + } +} + +/* + * Apply paths in scalar fashion, not crucial for performance. + */ +static void aut_airebo_lj_force_path(KernelArgsAIREBOT * ka, + bvec mask, fvec dC, LennardJonesPathAIREBOT path[fvec::VL]) { + for (int i = 0; i < fvec::VL; i++) { + if (bvec::test_at(mask, i)) { + ref_lennard_jones_force_path(ka, fvec::at(dC, i), &path[i]); + } + } +} + +/* + * Hash-Map for efficient calculation of C_ij. + * Can have up to ITEMS entries with associated paths, as well as + * 1024 entries. Open addressing, invalidation by using a different i. + * Only needs to be reset once per timestep. 
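+ * Concretely, there are OPT_TEST_PATH_SIZE (1024) hash slots, of which
+ * at most OPT_TEST_PATH_ITEMS (128) can carry an explicit test path.
+ * A scalar sketch of the probing scheme used below (the "%" form agrees
+ * with the "&(SIZE-1)" form in the vector variant because the table
+ * size is a power of two):
+ *
+ *   int slot = (j * 2654435761u + attempt) % OPT_TEST_PATH_SIZE;
+ *   while (map.i[slot] == i && map.j[slot] != j)   // occupied, wrong j
+ *     slot = hash(j, ++attempt);                   // probe next slot
+ *   // a slot with map.i[slot] != i counts as free: entries written for
+ *   // an earlier i are simply treated as stale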
+ */ +static const int OPT_TEST_PATH_SIZE = 1024; +static const int OPT_TEST_PATH_ITEMS = 128; +struct aut_airebo_lj_test_path_result_data { + LennardJonesPathAIREBOT testpath[OPT_TEST_PATH_ITEMS]; + int i[OPT_TEST_PATH_SIZE]; + int j[OPT_TEST_PATH_SIZE]; + flt_t cij[OPT_TEST_PATH_SIZE]; + int testpath_idx[OPT_TEST_PATH_SIZE]; +}; +static const unsigned int OPT_TEST_PATH_HASH = 2654435761; + +static int aut_lj_tap_hash_fn(int j, int attempt) { + uint32_t result = j; + result *= (uint32_t) OPT_TEST_PATH_HASH; + result += (uint32_t) attempt; + result %= (uint32_t) OPT_TEST_PATH_SIZE; + return result; +} + +static ivec aut_airebo_lj_tap_hash_fn_vec(ivec val, ivec attempt) { + const ivec golden = ivec::set1(OPT_TEST_PATH_HASH); + const ivec mask = ivec::set1(OPT_TEST_PATH_SIZE - 1); + ivec a = ivec::mullo(golden, val); + ivec b = a + attempt; + ivec c = ivec::the_and(b, mask); + return c; +} + +/* + * Enter all those (potential) neighbors of i (including 2nd and 3rd degree) + * into the hash-map. There is no good way to vectorize this, and it does not + * seem time-critical. + */ +static bool aut_airebo_lj_test_all_paths(KernelArgsAIREBOT * ka, + int i, struct aut_airebo_lj_test_path_result_data * result) { + AtomAIREBOT * x = ka->x; + int * map = ka->map; + flt_t (*rcmin)[2] = &ka->params.rcmin[0]; + flt_t (*rcmax)[2] = &ka->params.rcmax[0]; + flt_t rcminsq[2][2]; + rcminsq[0][0] = rcmin[0][0] * rcmin[0][0]; + rcminsq[0][1] = rcmin[0][1] * rcmin[0][1]; + rcminsq[1][0] = rcmin[1][0] * rcmin[1][0]; + rcminsq[1][1] = rcmin[1][1] * rcmin[1][1]; + int * neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]]; + int itype = map[x[i].w]; + int path_insert_pos = 0; + for (int jj = 0; jj < ka->neigh_rebo.num[i]; jj++) { + int j = neighs_i[jj]; + int jtype = map[x[j].w]; + flt_t dijx = x[j].x - x[i].x; + flt_t dijy = x[j].y - x[i].y; + flt_t dijz = x[j].z - x[i].z; + flt_t rijsq = dijx * dijx + dijy * dijy + dijz * dijz; + flt_t wj = 1, dwj = 0; + flt_t rij = 0; + if (rijsq >= rcminsq[itype][jtype]) { + rij = overloaded::sqrt(rijsq); + wj = Sp(rij, rcmin[itype][jtype], rcmax[itype][jtype], &dwj); + } + int attempt = 0; + int start_hash_slot = aut_lj_tap_hash_fn(j, attempt); + int hash_slot = start_hash_slot; + while (result->i[hash_slot] == i && result->j[hash_slot] != j && + attempt < OPT_TEST_PATH_SIZE) { + hash_slot = aut_lj_tap_hash_fn(j, ++attempt); + } + if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; + bool init_slot = result->i[hash_slot] != i; + if (init_slot || (1 - wj < result->cij[hash_slot])) { + result->i[hash_slot] = i; + result->j[hash_slot] = j; + result->cij[hash_slot] = 1 - wj; + if (wj != 1.0) { + if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits; + result->testpath_idx[hash_slot] = path_insert_pos; + LennardJonesPathAIREBOT *path = + &result->testpath[path_insert_pos++]; + path->num = 2; + path->del[0].x = dijx; + path->del[0].y = dijy; + path->del[0].z = dijz; + if (rij == 0) rij = sqrt(rijsq); + path->r[0] = rij; + path->w[0] = wj; + path->dw[0] = dwj; + path->idx[0] = i; + path->idx[1] = j; + } + } + int * neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]]; + for (int kk = 0; kk < ka->neigh_rebo.num[j]; kk++) { + int k = neighs_j[kk]; + if (k == i) continue; + int ktype = map[x[k].w]; + flt_t djkx = x[k].x - x[j].x; + flt_t djky = x[k].y - x[j].y; + flt_t djkz = x[k].z - x[j].z; + flt_t rjksq = djkx * djkx + djky * djky + djkz * djkz; + flt_t wk = 1, dwk = 0; + flt_t rjk = 0; + if (rjksq >= rcminsq[jtype][ktype]) { + rjk = 
overloaded::sqrt(rjksq); + wk = Sp(rjk, rcmin[jtype][ktype], rcmax[jtype][ktype], &dwk); + } + int attempt = 0; + int start_hash_slot = aut_lj_tap_hash_fn(k, attempt); + int hash_slot = start_hash_slot; + while (result->i[hash_slot] == i && result->j[hash_slot] != k && + attempt < OPT_TEST_PATH_SIZE) { + hash_slot = aut_lj_tap_hash_fn(k, ++attempt); + } + if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; + bool init_slot = result->i[hash_slot] != i; + if (init_slot || (1 - wj * wk < result->cij[hash_slot])) { + result->i[hash_slot] = i; + result->j[hash_slot] = k; + result->cij[hash_slot] = 1 - wj * wk; + if (wj * wk != 1.0) { + if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits; + result->testpath_idx[hash_slot] = path_insert_pos; + LennardJonesPathAIREBOT *path = + &result->testpath[path_insert_pos++]; + path->num = 3; + path->del[0].x = dijx; + path->del[0].y = dijy; + path->del[0].z = dijz; + if (rij == 0) rij = sqrt(rijsq); + path->r[0] = rij; + path->del[1].x = djkx; + path->del[1].y = djky; + path->del[1].z = djkz; + if (rjk == 0) rjk = sqrt(rjksq); + path->r[1] = rjk; + path->w[0] = wj; + path->dw[0] = dwj; + path->w[1] = wk; + path->dw[1] = dwk; + path->idx[0] = i; + path->idx[1] = j; + path->idx[2] = k; + } + } + int * neighs_k = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[k]]; + for (int ll = 0; ll < ka->neigh_rebo.num[k]; ll++) { + int l = neighs_k[ll]; + if ((l == i) || (l == j)) continue; + int ltype = map[x[l].w]; + flt_t dklx = x[l].x - x[k].x; + flt_t dkly = x[l].y - x[k].y; + flt_t dklz = x[l].z - x[k].z; + flt_t rklsq = dklx * dklx + dkly * dkly + dklz * dklz; + flt_t wl = 1, dwl = 0; + flt_t rkl = 0; + if (rklsq >= rcminsq[ktype][ltype]) { + rkl = overloaded::sqrt(rklsq); + wl = Sp(rkl, rcmin[ktype][ltype], rcmax[ktype][ltype], &dwl); + } + int attempt = 0; + int start_hash_slot = aut_lj_tap_hash_fn(l, attempt); + int hash_slot = start_hash_slot; + while (result->i[hash_slot] == i && result->j[hash_slot] != l && + attempt < OPT_TEST_PATH_SIZE) { + hash_slot = aut_lj_tap_hash_fn(l, ++attempt); + } + if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; + bool init_slot = result->i[hash_slot] != i; + if (init_slot || (1 - wj * wk * wl < result->cij[hash_slot])) { + result->i[hash_slot] = i; + result->j[hash_slot] = l; + result->cij[hash_slot] = 1 - wj * wk * wl; + if (wj * wk * wl != 1.0) { + if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits; + result->testpath_idx[hash_slot] = path_insert_pos; + LennardJonesPathAIREBOT *path = + &result->testpath[path_insert_pos++]; + path->num = 4; + path->del[0].x = dijx; + path->del[0].y = dijy; + path->del[0].z = dijz; + if (rij == 0) rij = sqrt(rijsq); + path->r[0] = rij; + path->del[1].x = djkx; + path->del[1].y = djky; + path->del[1].z = djkz; + if (rjk == 0) rjk = sqrt(rjksq); + path->r[1] = rjk; + path->del[2].x = dklx; + path->del[2].y = dkly; + path->del[2].z = dklz; + if (rkl == 0) rkl = sqrt(rklsq); + path->r[2] = rkl; + path->w[0] = wj; + path->dw[0] = dwj; + path->w[1] = wk; + path->dw[1] = dwk; + path->w[2] = wl; + path->dw[2] = dwl; + path->idx[0] = i; + path->idx[1] = j; + path->idx[2] = k; + path->idx[3] = l; + } + } + } + } + } + return true; +exceed_limits: + return false; +} + +/* + * Attempt to look up an element in the hash-map. 
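+ * All lanes probe independently: a lane leaves the loop once its slot
+ * matches (i, j), or once its probe chain reaches a slot owned by a
+ * different i (not found); the remaining lanes rehash with an
+ * incremented attempt counter. Per lane this is equivalent to the
+ * scalar lookup_slot() sketch above. Lanes for which no entry is found
+ * keep the default cij = 1.0, i.e. the pair has no intervening REBO
+ * path and the full Lennard-Jones weight applies.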
+ */
+static fvec aut_airebo_lj_tap_test_path(KernelArgsAIREBOT * ka,
+    struct aut_airebo_lj_test_path_result_data * test_path_result,
+    bvec need_search, ivec i_bc, ivec j,
+    LennardJonesPathAIREBOT path[fvec::VL]
+) {
+  const ivec c_i1 = ivec::set1(1);
+  fvec cij = fvec::set1(1.0);
+  // first round: hash all j
+  // lookup i/j in hash list.
+  // if i matches and j matches: congrats
+  // if i matches and j does not: look up attempts
+  //   if attempts > current_attempts:
+  //     do another round of hashing
+  // for all those found:
+
+  // fill in the path
+  // -----------------------------------------------
+  // find all the correct hash slots, and a mask of where found.
+  ivec attempt = ivec::setzero();
+  ivec hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
+  ivec lookup_i = ivec::mask_gather(ivec::undefined(), need_search, hash_slot,
+                                    &test_path_result->i[0], sizeof(int));
+  bvec correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
+  ivec lookup_j = ivec::mask_gather(ivec::undefined(), correct_i, hash_slot,
+                                    &test_path_result->j[0], sizeof(int));
+  bvec found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
+  bvec another_attempt = correct_i & ~ found_items;
+  while (bvec::test_any_set(another_attempt)) {
+    attempt = ivec::mask_add(attempt, another_attempt, attempt, c_i1);
+    hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
+    ivec lookup_i_2 = ivec::mask_gather(lookup_i, another_attempt, hash_slot,
+                                        &test_path_result->i[0], sizeof(int));
+    lookup_i = lookup_i_2;
+    correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
+    lookup_j = ivec::mask_gather(lookup_j, another_attempt, hash_slot,
+                                 &test_path_result->j[0], sizeof(int));
+    found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
+    another_attempt = correct_i & ~ found_items;
+  }
+  cij = fvec::mask_gather(cij, found_items, hash_slot,
+                          &test_path_result->cij[0], sizeof(flt_t));
+  bvec need_testpath = fvec::mask_cmplt(found_items, fvec::setzero(), cij);
+  if (bvec::test_any_set(need_testpath)) {
+    for (int i = 0; i < fvec::VL; i++) {
+      if (bvec::test_at(need_testpath, i)) {
+        int testpath_idx =
+          test_path_result->testpath_idx[ivec::at(hash_slot, i)];
+        path[i] = test_path_result->testpath[testpath_idx];
+      }
+    }
+  }
+  return cij;
+}
+
+/*
+ * This function calculates the Lennard-Jones interaction for those
+ * elements that require a bond-order calculation.
+ * It is structured similarly to the aut_frebo_batch_of_kind function.
+ * The forces due to bond orders are calculated speculatively and later
+ * updated with the correct outer derivative.
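+ * In terms of the switching functions Str (distance) and Stb (bond
+ * order) and the path weight cij, the energy assembled below is
+ *   E = cij * VLJ * (Str * Stb + 1 - Str),
+ * which reduces to the bare Lennard-Jones term cij * VLJ where Str = 0
+ * and to the fully bond-order-modulated cij * Stb * VLJ where Str = 1.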
+ */ +template +static void aut_lj_with_bo( + KernelArgsAIREBOT * ka, + int itype, int jtype, + ivec i, ivec j, + fvec cij, LennardJonesPathAIREBOT testpath[fvec::VL] +) { + { // jump-scope for exceed_limits + AtomAIREBOT * _noalias x = ka->x; + ResultForceT * result_f = ka->result_f; + + ivec c_i4 = ivec::set1(4); + fvec c_1_0 = fvec::set1(1.0); + fvec c_2_0 = fvec::set1(2.0); + fvec c_0_5 = fvec::set1(0.5); + + fvec x_i, y_i, z_i; + aut_loadatoms_vec_notype(x, i, &x_i, &y_i, &z_i); + fvec x_j, y_j, z_j; + aut_loadatoms_vec_notype(x, j, &x_j, &y_j, &z_j); + fvec delx = x_i - x_j; + fvec dely = y_i - y_j; + fvec delz = z_i - z_j; + fvec rsq = delx * delx + dely * dely + delz * delz; + + fvec rij = fvec::sqrt(rsq); + bvec need_path_force = fvec::cmplt(cij, c_1_0); + flt_t sigcut = ka->params.sigcut; + flt_t sigmin = ka->params.sigmin; + flt_t sigma = ka->params.sigma[itype][jtype]; + flt_t rljmax = sigcut * sigma; + flt_t rljmin = sigmin * sigma; + fvec p_rljmin = fvec::set1(rljmin); + fvec p_rljmax = fvec::set1(rljmax); + + fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw); + + fvec p_lj1 = fvec::set1(ka->params.lj1[itype][jtype]); + fvec p_lj2 = fvec::set1(ka->params.lj2[itype][jtype]); + fvec p_lj3 = fvec::set1(ka->params.lj3[itype][jtype]); + fvec p_lj4 = fvec::set1(ka->params.lj4[itype][jtype]); + + fvec r2inv = fvec::recip(rsq); + + fvec vdw, dvdw; + if (MORSEFLAG) { + fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4); + vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0); + dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr); + } else { + fvec r6inv = r2inv * r2inv * r2inv; + + vdw = r6inv * ( p_lj3 * r6inv - p_lj4); + fvec r7inv = r6inv * rij * r2inv; + dvdw = r7inv * ( p_lj2 - p_lj1 * r6inv); + } + + fvec VLJ = vdw * slw; + fvec dVLJ = dvdw * slw + vdw * dslw; + + fvec p_rcLJmin = fvec::set1(ka->params.rcLJmin[itype][jtype]); + fvec p_rcLJmax = fvec::set1(ka->params.rcLJmax[itype][jtype]); + fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr); + fvec VA = cij * VLJ * Str; + + fvec fij[3], fji[3]; + fij[0] = fvec::setzero(); fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + fji[0] = fvec::setzero(); fji[1] = fvec::setzero(); + fji[2] = fvec::setzero(); + + ivec vi = i; + ivec vj = j; + + struct aut_frebo_data i_data, j_data; + i_data.x_i = x_i; + i_data.y_i = y_i; + i_data.z_i = z_i; + i_data.x_j = x_j; + i_data.y_j = y_j; + i_data.z_j = z_j; + j_data.x_i = x_j; + j_data.y_i = y_j; + j_data.z_i = z_j; + j_data.x_j = x_i; + j_data.y_j = y_i; + j_data.z_j = z_i; + + fvec p_rcmin = fvec::set1(ka->params.rcmin[itype][jtype]); + fvec p_rcmax = fvec::set1(ka->params.rcmax[itype][jtype]); + fvec dwij; + fvec wij = aut_Sp_deriv(rij, p_rcmin, p_rcmax, &dwij); + + fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t)); + fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t)); + fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t)); + fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t)); + fvec Nij = nHi + nCi - wij; + fvec Nji = nHj + nCj - wij; + i_data.nHi = nHi; + i_data.nCi = nCi; + j_data.nHi = nHj; + j_data.nCi = nCj; + + fvec the_r = fvec::set1(ka->params.rcmin[itype][jtype]); + fvec scale = the_r / rij; + + fvec NconjtmpI; + fvec pij = aut_frebo_pij_pd_2(ka, &i_data, itype, jtype, vi, vj, + delx * scale, dely * scale, delz * scale, + the_r, wij, VA, &NconjtmpI, fij); + + if (i_data.buf_len < 0) goto exceed_limits; + + fvec NconjtmpJ; + fvec rjix = fvec::setzero() - delx; + fvec rjiy = fvec::setzero() - dely; + fvec rjiz = fvec::setzero() - delz; + fvec pji = aut_frebo_pij_pd_2(ka, 
&j_data, jtype, itype, vj, vi, + rjix * scale, rjiy * scale, rjiz * scale, + the_r, wij, VA, &NconjtmpJ, fji); + fij[0] = fij[0] - fji[0]; + fij[1] = fij[1] - fji[1]; + fij[2] = fij[2] - fji[2]; + + if (j_data.buf_len < 0) goto exceed_limits; + + fvec Nijconj = c_1_0 + NconjtmpI * NconjtmpI + NconjtmpJ * NconjtmpJ; + fvec dN3[3]; + fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3); + + fvec c_TOL = fvec::set1(TOL); + fvec dN3_dh[3]; + fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3_dh[0]); + bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL); + fvec sum_omega = fvec::setzero(); + if (bvec::test_any_set(TijgtTOLmask)) { + sum_omega = aut_frebo_sum_omega( + ka, &i_data, &j_data, itype, jtype, vi, vj, + delx * scale, dely * scale, delz * scale, the_r, VA * Tij, fij); + sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega); + } + fvec pi_dh = Tij * sum_omega; + + fvec bij = c_0_5 * ( pij + pji) + pi_rc + pi_dh; + + fvec p_bLJmin = fvec::set1(ka->params.bLJmin[itype][jtype]); + fvec p_bLJmax = fvec::set1(ka->params.bLJmax[itype][jtype]); + fvec dStb, Stb = aut_Sp2_deriv(bij, p_bLJmin, p_bLJmax, &dStb); + + bvec need_bo_deriv = fvec::cmpneq(dStb, fvec::setzero()); + // fix up j_data, i_data, fij: + // multiply each by dStb + if (bvec::test_any_set(need_bo_deriv)) { + i_data.force_i_x = dStb * i_data.force_i_x; + i_data.force_i_y = dStb * i_data.force_i_y; + i_data.force_i_z = dStb * i_data.force_i_z; + i_data.force_j_x = dStb * i_data.force_j_x; + i_data.force_j_y = dStb * i_data.force_j_y; + i_data.force_j_z = dStb * i_data.force_j_z; + j_data.force_i_x = dStb * j_data.force_i_x; + j_data.force_i_y = dStb * j_data.force_i_y; + j_data.force_i_z = dStb * j_data.force_i_z; + j_data.force_j_x = dStb * j_data.force_j_x; + j_data.force_j_y = dStb * j_data.force_j_y; + j_data.force_j_z = dStb * j_data.force_j_z; + for (int k = 0; k < i_data.buf_len; k++) { + i_data.force_k_x_buf[k] = dStb * i_data.force_k_x_buf[k]; + i_data.force_k_y_buf[k] = dStb * i_data.force_k_y_buf[k]; + i_data.force_k_z_buf[k] = dStb * i_data.force_k_z_buf[k]; + } + for (int k = 0; k < j_data.buf_len; k++) { + j_data.force_k_x_buf[k] = dStb * j_data.force_k_x_buf[k]; + j_data.force_k_y_buf[k] = dStb * j_data.force_k_y_buf[k]; + j_data.force_k_z_buf[k] = dStb * j_data.force_k_z_buf[k]; + } + fvec fijc[3]; + fijc[0] = dStb * fij[0]; + fijc[1] = dStb * fij[1]; + fijc[2] = dStb * fij[2]; + fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * + fijc[1] + delz * delx * fijc[2]) / rsq); + fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * + fijc[1] + delz * dely * fijc[2]) / rsq); + fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * + fijc[1] + delz * delz * fijc[2]) / rsq); + + aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, dStb * VA, + dN3[0], dN3[2], NconjtmpI); + aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, dStb * VA, + dN3[1], dN3[2], NconjtmpJ); + if (bvec::test_any_set(TijgtTOLmask)) { + aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, + dStb * VA * sum_omega, dN3_dh[0], dN3_dh[2], + NconjtmpI); + aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, + dStb * VA * sum_omega, dN3_dh[1], dN3_dh[2], + NconjtmpJ); + } + + aut_frebo_data_writeback(ka, &i_data); + aut_frebo_data_writeback(ka, &j_data); + } else { + fij[0] = fvec::setzero(); + fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + } + + fvec fpdVLJ = cij * dVLJ * ( c_1_0 + Str * ( Stb - c_1_0)); + fvec fpdStr = 
dStr * cij * ( Stb * VLJ - VLJ);
+    fvec fpair = r2inv * rij * ( fvec::setzero() - ( fpdVLJ + fpdStr));
+    fvec evdwl = VA * Stb + cij * VLJ * ( c_1_0 - Str);
+
+    fvec result_f_i_x = fpair * delx + fij[0];
+    fvec result_f_i_y = fpair * dely + fij[1];
+    fvec result_f_i_z = fpair * delz + fij[2];
+    fvec result_f_j_x = fvec::setzero() - result_f_i_x;
+    fvec result_f_j_y = fvec::setzero() - result_f_i_y;
+    fvec result_f_j_z = fvec::setzero() - result_f_i_z;
+
+    flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
+    int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
+    flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
+    flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
+    int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
+    flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
+
+    if (bvec::test_any_set(need_bo_deriv)) {
+      result_f_i_x = i_data.force_i_x + result_f_i_x;
+      result_f_i_y = i_data.force_i_y + result_f_i_y;
+      result_f_i_z = i_data.force_i_z + result_f_i_z;
+      result_f_j_x = i_data.force_j_x + result_f_j_x;
+      result_f_j_y = i_data.force_j_y + result_f_j_y;
+      result_f_j_z = i_data.force_j_z + result_f_j_z;
+
+      result_f_i_x = j_data.force_j_x + result_f_i_x;
+      result_f_i_y = j_data.force_j_y + result_f_i_y;
+      result_f_i_z = j_data.force_j_z + result_f_i_z;
+      result_f_j_x = j_data.force_i_x + result_f_j_x;
+      result_f_j_y = j_data.force_i_y + result_f_j_y;
+      result_f_j_z = j_data.force_i_z + result_f_j_z;
+    }
+
+    fvec::store(fi_x_buf, result_f_i_x);
+    fvec::store(fi_y_buf, result_f_i_y);
+    fvec::store(fi_z_buf, result_f_i_z);
+    ivec::store(fi_i_buf, vi);
+    fvec::store(fj_x_buf, result_f_j_x);
+    fvec::store(fj_y_buf, result_f_j_y);
+    fvec::store(fj_z_buf, result_f_j_z);
+    ivec::store(fj_j_buf, vj);
+    fvec::store(evdwl_buf, evdwl);
+
+    int lane;
+    for (lane = 0; lane < fvec::VL; lane++) {
+      int ii = fi_i_buf[lane];
+      result_f[ii].x += fi_x_buf[lane];
+      result_f[ii].y += fi_y_buf[lane];
+      result_f[ii].z += fi_z_buf[lane];
+      result_f[ii].w += 0.5 * evdwl_buf[lane];
+      int jj = fj_j_buf[lane];
+      result_f[jj].x += fj_x_buf[lane];
+      result_f[jj].y += fj_y_buf[lane];
+      result_f[jj].z += fj_z_buf[lane];
+      result_f[jj].w += 0.5 * evdwl_buf[lane];
+    }
+    ka->result_eng += fvec::reduce_add(evdwl);
+
+    if (bvec::test_any_set(need_path_force)) {
+      fvec dC = VLJ * ( Str * Stb + c_1_0 - Str);
+      aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
+    }
+    return;
+  }
+exceed_limits:
+  for (int l = 0; l < fvec::VL; l++) {
+    ref_lennard_jones_single_interaction(ka, ivec::at(i, l), ivec::at(j, l),
+                                         MORSEFLAG);
+  }
+  return;
+}
+
+/*
+ * Calculate the Lennard-Jones interaction.
+ * Uses the above hash-map, and outlines (moves out of line) the
+ * calculation if the bond order is needed.
+ * Aggressively compresses to get the most values calculated.
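+ * Candidate neighbors of i are screened against the cutoff; survivors
+ * are compressed into a staging vector (j_2 and friends) until all
+ * fvec::VL lanes are occupied, and only full vectors enter the path
+ * search and the outlined bond-order kernel. Schematically (scalar
+ * pseudo-code of the batching, for illustration only):
+ *
+ *   for each candidate j of atom i:
+ *     if (rsq < cutoff) staging[count++] = j;
+ *     if (count == fvec::VL) { process_full_vector(staging); count = 0; }
+ *   // leftovers are flushed through the scalar reference kernels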
+ */ +template +static void aut_lennard_jones(KernelArgsAIREBOT * ka) { + AtomAIREBOT * x = ka->x; + int * tag = ka->tag; + int * map = ka->map; + ResultForceT * result_f = ka->result_f; + ivec c_i1 = ivec::set1(1); + ivec c_i4 = ivec::set1(4); + fvec c_1_0 = fvec::set1(1.0); + fvec c_2_0 = fvec::set1(2.0); + fvec c_0_0 = fvec::set1(0.0); + int map_i_scalar = 0; + { + int i; + for (i = 1; i < ka->num_types; i++) { + if (ka->map[i]) + map_i_scalar |= (1 << i); + } + } + ivec map_i = ivec::set1(map_i_scalar); + fvec result_eng = fvec::setzero(); + + struct aut_airebo_lj_test_path_result_data test_path_result; + for (int i = 0; i < OPT_TEST_PATH_SIZE; i++) { + test_path_result.i[i] = -1; + } + + ivec i_bo[2][2]; + ivec j_bo[2][2]; + fvec cij_bo[2][2]; + LennardJonesPathAIREBOT testpath_bo[2][2][fvec::VL]; + int num_bo[2][2] = {0}; + + for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) { + ivec itag_bc = ivec::set1(tag[i]); + int itype = map[x[i].w]; + fvec x_i = fvec::set1(x[i].x); + fvec y_i = fvec::set1(x[i].y); + fvec z_i = fvec::set1(x[i].z); + ivec i_bc = ivec::set1(i); + + fvec cutljsq0 = fvec::set1(ka->params.cutljsq[itype][0]); + fvec cutljsq1 = fvec::set1(ka->params.cutljsq[itype][1]); + fvec p_rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); + fvec p_rcmax1 = fvec::set1(ka->params.rcmax[itype][1]); + flt_t sigcut = ka->params.sigcut; + flt_t sigmin = ka->params.sigmin; + flt_t sigma0 = ka->params.sigma[itype][0]; + flt_t rljmax0 = sigcut * sigma0; + flt_t rljmin0 = sigmin * sigma0; + flt_t sigma1 = ka->params.sigma[itype][1]; + flt_t rljmax1 = sigcut * sigma1; + flt_t rljmin1 = sigmin * sigma1; + fvec p_rljmax0 = fvec::set1(rljmax0); + fvec p_rljmax1 = fvec::set1(rljmax1); + fvec p_rljmin0 = fvec::set1(rljmin0); + fvec p_rljmin1 = fvec::set1(rljmin1); + fvec p_rcLJmax0 = fvec::set1(ka->params.rcLJmax[itype][0]); + fvec p_rcLJmax1 = fvec::set1(ka->params.rcLJmax[itype][1]); + fvec p_rcLJmin0 = fvec::set1(ka->params.rcLJmin[itype][0]); + fvec p_rcLJmin1 = fvec::set1(ka->params.rcLJmin[itype][1]); + fvec p_lj10 = fvec::set1(ka->params.lj1[itype][0]); + fvec p_lj11 = fvec::set1(ka->params.lj1[itype][1]); + fvec p_lj20 = fvec::set1(ka->params.lj2[itype][0]); + fvec p_lj21 = fvec::set1(ka->params.lj2[itype][1]); + fvec p_lj30 = fvec::set1(ka->params.lj3[itype][0]); + fvec p_lj31 = fvec::set1(ka->params.lj3[itype][1]); + fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]); + fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]); + + int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i]; + int jnum = ka->neigh_lmp.num_half[i]; + + bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result); + if (! 
tap_success) { + for (int jj = 0; jj < jnum; jj++) { + ref_lennard_jones_single_interaction(ka, i, neighs[jj], MORSEFLAG); + } + continue; + } + + ivec j_2; + fvec delx_2, dely_2, delz_2, rsq_2; + bvec jtype_mask_2; + int num_2 = 0; + + fvec result_f_i_x = fvec::setzero(); + fvec result_f_i_y = fvec::setzero(); + fvec result_f_i_z = fvec::setzero(); + + int jj = 0; + bool rest_j = jj < jnum; + bool rest_2 = fvec::fast_compress(); + #pragma forceinline recursive + while (rest_j || rest_2) { + fvec delx, dely, delz, rsq; + bvec jtype_mask, within_cutoff; + ivec j; + if (rest_j) { + bvec mask_0 = bvec::full(); + //0xFF >> (8 - (jnum - jj)); + if (jj + (fvec::VL - 1) >= jnum) mask_0 = bvec::only(jnum - jj); + j = ivec::maskz_loadu(mask_0, &neighs[jj]); + fvec x_j, y_j, z_j; + aut_loadatoms_vec(x, j, &x_j, &y_j, &z_j, &jtype_mask, map, map_i, + c_i1); + fvec::gather_prefetch0(ivec::mullo(c_i4, + ivec::maskz_loadu(bvec::full(), &neighs[jj + fvec::VL])), x); + _mm_prefetch((const char*)&neighs[jj + 2 * fvec::VL], _MM_HINT_T0); + delx = x_i - x_j; + dely = y_i - y_j; + delz = z_i - z_j; + rsq = delx * delx + dely * dely + delz * delz; + fvec cutoff_sq = fvec::mask_blend(jtype_mask, cutljsq0, cutljsq1); + within_cutoff = fvec::mask_cmplt(mask_0, rsq, cutoff_sq); + + if (fvec::fast_compress()) { + j = ivec::masku_compress(within_cutoff, j); + delx = fvec::masku_compress(within_cutoff, delx); + dely = fvec::masku_compress(within_cutoff, dely); + delz = fvec::masku_compress(within_cutoff, delz); + rsq = fvec::masku_compress(within_cutoff, rsq); + jtype_mask = bvec::masku_compress(within_cutoff, jtype_mask); + //within_cutoff = 0xFF >> (8 - _cc_popcnt(within_cutoff)); + + bvec mask_2 = bvec::after(num_2);//0xFF << num_2; + j_2 = ivec::mask_expand(j_2, mask_2, j); + delx_2 = fvec::mask_expand(delx_2, mask_2, delx); + dely_2 = fvec::mask_expand(dely_2, mask_2, dely); + delz_2 = fvec::mask_expand(delz_2, mask_2, delz); + rsq_2 = fvec::mask_expand(rsq_2, mask_2, rsq); + jtype_mask_2 = bvec::mask_expand(jtype_mask_2, mask_2, jtype_mask); + num_2 = num_2 + bvec::popcnt(within_cutoff); + if (num_2 < fvec::VL) { + jj += fvec::VL; + rest_j = jj < jnum; + continue; + } + + num_2 -= fvec::VL; + //(0xFF >> (8 - num_2)) << (_cc_popcnt(within_cutoff) - num_2); + mask_2 = bvec::onlyafter(num_2, bvec::popcnt(within_cutoff) - num_2); + { + ivec tmp_j = j_2; + j_2 = ivec::masku_compress(mask_2, j); + j = tmp_j; + fvec tmp_delx = delx_2; + delx_2 = fvec::masku_compress(mask_2, delx); + delx = tmp_delx; + fvec tmp_dely = dely_2; + dely_2 = fvec::masku_compress(mask_2, dely); + dely = tmp_dely; + fvec tmp_delz = delz_2; + delz_2 = fvec::masku_compress(mask_2, delz); + delz = tmp_delz; + fvec tmp_rsq = rsq_2; + rsq_2 = fvec::masku_compress(mask_2, rsq); + rsq = tmp_rsq; + bvec tmp_jtype_mask = jtype_mask_2; + jtype_mask_2 = bvec::masku_compress(mask_2, jtype_mask); + jtype_mask = tmp_jtype_mask; + within_cutoff = bvec::full(); + } + } + } else if (rest_2) { + rest_2 = false; + j = j_2; + delx = delx_2; + dely = dely_2; + delz = delz_2; + rsq = rsq_2; + jtype_mask = jtype_mask_2; + within_cutoff = bvec::only(num_2); + num_2 = 0; + } + + bvec current_mask = within_cutoff; + if (bvec::test_all_unset(current_mask)) { + jj += fvec::VL; + rest_j = jj < jnum; + continue; + } + + fvec rij = fvec::sqrt(rsq); + LennardJonesPathAIREBOT testpath[fvec::VL]; + fvec cij = c_1_0; + fvec p_cut3rebo = fvec::set1(ka->params.cut3rebo); + bvec need_search = fvec::mask_cmplt(current_mask, rij, p_cut3rebo); + if (bvec::test_any_set(need_search)) 
{ + fvec p_rcmax = fvec::mask_blend(jtype_mask, p_rcmax0, p_rcmax1); + #pragma noinline + cij = aut_airebo_lj_tap_test_path(ka, &test_path_result, need_search, + i_bc, j, testpath); + } + current_mask = fvec::mask_cmplt(current_mask, c_0_0, cij); + if (bvec::test_all_unset(current_mask)) { + jj += fvec::VL; + rest_j = jj < jnum; + continue; + } + bvec need_path_force = fvec::mask_cmplt(current_mask, cij, c_1_0); + + fvec p_rljmax = fvec::mask_blend(jtype_mask, p_rljmax0, p_rljmax1); + fvec p_rljmin = fvec::mask_blend(jtype_mask, p_rljmin0, p_rljmin1); + + fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw); + + fvec p_lj1 = fvec::mask_blend(jtype_mask, p_lj10, p_lj11); + fvec p_lj2 = fvec::mask_blend(jtype_mask, p_lj20, p_lj21); + fvec p_lj3 = fvec::mask_blend(jtype_mask, p_lj30, p_lj31); + fvec p_lj4 = fvec::mask_blend(jtype_mask, p_lj40, p_lj41); + + fvec vdw, dvdw; + + fvec r2inv = fvec::recip(rsq); + + if (MORSEFLAG) { + fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4); + vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0); + dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr); + } else { + fvec r6inv = r2inv * r2inv * r2inv; + + vdw = r6inv * ( p_lj3 * r6inv - p_lj4); + fvec r7inv = r6inv * rij * r2inv; + dvdw = r7inv * ( p_lj2 - p_lj1 * r6inv); + } + + fvec VLJ = vdw * slw; + fvec dVLJ = dvdw * slw + vdw * dslw; + + fvec p_rcLJmin = fvec::mask_blend(jtype_mask, p_rcLJmin0, p_rcLJmin1); + fvec p_rcLJmax = fvec::mask_blend(jtype_mask, p_rcLJmax0, p_rcLJmax1); + fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr); + fvec VA = cij * VLJ * Str; + bvec need_bondorder = fvec::mask_cmplt(current_mask, c_0_0, Str); + fvec Stb = fvec::setzero(); + fvec fij[3]; + fij[0] = fvec::setzero(); + fij[1] = fvec::setzero(); + fij[2] = fvec::setzero(); + if (bvec::test_any_set(need_bondorder)) { + for (int jtype = 0; jtype < 2; jtype++) { + bvec need_bo_with_jtype = need_bondorder; + if (jtype) need_bo_with_jtype = need_bo_with_jtype & jtype_mask; + else need_bo_with_jtype = need_bo_with_jtype & ~ jtype_mask; + ivec jtmp = ivec::masku_compress(need_bo_with_jtype, j); + ivec itmp = ivec::masku_compress(need_bo_with_jtype, ivec::set1(i)); + fvec cijtmp = fvec::masku_compress(need_bo_with_jtype, cij); + bvec insert_mask = bvec::after(num_bo[itype][jtype]); + i_bo[itype][jtype] = ivec::mask_expand(i_bo[itype][jtype], + insert_mask, itmp); + j_bo[itype][jtype] = ivec::mask_expand(j_bo[itype][jtype], + insert_mask, jtmp); + cij_bo[itype][jtype] = fvec::mask_expand(cij_bo[itype][jtype], + insert_mask, cijtmp); + bvec need_path_force_with_jtype = need_bo_with_jtype & + need_path_force; + int testpath_end = fvec::VL; + if (bvec::test_any_set(need_path_force_with_jtype)) { + int pos = num_bo[itype][jtype]; + for (int l = 0; l < fvec::VL; l++) { + if (pos >= fvec::VL) { + testpath_end = l; + break; + } + if (bvec::test_at(need_path_force_with_jtype, l)) { + testpath_bo[itype][jtype][pos] = testpath[l]; + } + if (bvec::test_at(need_bo_with_jtype, l)) { + pos += 1; + } + } + } + num_bo[itype][jtype] = num_bo[itype][jtype] + + bvec::popcnt(need_bo_with_jtype); + if (num_bo[itype][jtype] >= fvec::VL) { + #pragma noinline + aut_lj_with_bo(ka, itype, jtype, i_bo[itype][jtype], + j_bo[itype][jtype], cij_bo[itype][jtype], + testpath_bo[itype][jtype]); + num_bo[itype][jtype] -= fvec::VL; + insert_mask = bvec::onlyafter(num_bo[itype][jtype], + bvec::popcnt(need_bo_with_jtype) - + num_bo[itype][jtype]); + i_bo[itype][jtype] = ivec::masku_compress(insert_mask, itmp); + j_bo[itype][jtype] = 
ivec::masku_compress(insert_mask, jtmp); + cij_bo[itype][jtype] = fvec::masku_compress(insert_mask, cijtmp); + if (bvec::test_any_set(need_path_force_with_jtype)) { + int pos = 0; + for (int l = testpath_end; l < fvec::VL; l++) { + if (bvec::test_at(need_path_force_with_jtype, l)) { + testpath_bo[itype][jtype][pos] = testpath[l]; + } + if (bvec::test_at(need_bo_with_jtype, l)) { + pos += 1; + } + } + } + } + } + current_mask = current_mask & ~ need_bondorder; + need_path_force = need_path_force & ~ need_bondorder; + } + + fvec fpdVLJ = cij * dVLJ * ( c_1_0 + Str * ( Stb - c_1_0)); + fvec fpdStr = dStr * cij * ( Stb * VLJ - VLJ); + fvec fpair = r2inv * rij * ( fvec::setzero() - ( fpdVLJ + fpdStr)); + fvec evdwl = VA * Stb + cij * VLJ * ( c_1_0 - Str); + + fvec fix = fpair * delx + fij[0]; + fvec fiy = fpair * dely + fij[1]; + fvec fiz = fpair * delz + fij[2]; + result_f_i_x = fvec::mask_add(result_f_i_x, current_mask, result_f_i_x, + fix); + result_f_i_y = fvec::mask_add(result_f_i_y, current_mask, result_f_i_y, + fiy); + result_f_i_z = fvec::mask_add(result_f_i_z, current_mask, result_f_i_z, + fiz); + result_eng = fvec::mask_add(result_eng, current_mask, result_eng, evdwl); + + ivec j_dbl_idx = ivec::mullo(j, c_i4); + avec fjx = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, + &ka->result_f[0].x, sizeof(acc_t)); + avec fjy = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, + &ka->result_f[0].y, sizeof(acc_t)); + avec fjz = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, + &ka->result_f[0].z, sizeof(acc_t)); + + fjx = fjx - fix; + fjy = fjy - fiy; + fjz = fjz - fiz; + avec::mask_i32loscatter(&ka->result_f[0].x, current_mask, j_dbl_idx, fjx, + sizeof(acc_t)); + avec::mask_i32loscatter(&ka->result_f[0].y, current_mask, j_dbl_idx, fjy, + sizeof(acc_t)); + avec::mask_i32loscatter(&ka->result_f[0].z, current_mask, j_dbl_idx, fjz, + sizeof(acc_t)); + + if (bvec::test_any_set(need_path_force)) { + fvec dC = VLJ * ( Str * Stb + c_1_0 - Str); + #pragma noinline + aut_airebo_lj_force_path(ka, need_path_force, dC, testpath); + } + jj += fvec::VL; + rest_j = jj < jnum; + } + ka->result_f[i].x += fvec::reduce_add(result_f_i_x); + ka->result_f[i].y += fvec::reduce_add(result_f_i_y); + ka->result_f[i].z += fvec::reduce_add(result_f_i_z); + } + for (int itype = 0; itype < 2; itype++) { + for (int jtype = 0; jtype < 2; jtype++) { + for (int l = 0; l < num_bo[itype][jtype]; l++) { + ref_lennard_jones_single_interaction(ka,ivec::at(i_bo[itype][jtype],l), + ivec::at(j_bo[itype][jtype], l), + MORSEFLAG); + } + } + } + ka->result_eng += fvec::reduce_add(result_eng); +} + +}; + +template +void aut_lennard_jones(KernelArgsAIREBOT * ka, int morseflag) { +#ifdef LMP_INTEL_AIREBO_REF + ref_lennard_jones(ka, morseflag); +#else + if (morseflag) { + aut_wrap::template aut_lennard_jones<1>(ka); + } else { + aut_wrap::template aut_lennard_jones<0>(ka); + } +#endif +} + +template +void aut_rebo_neigh(KernelArgsAIREBOT * ka) { +#ifdef LMP_INTEL_AIREBO_REF + ref_rebo_neigh(ka); +#else + aut_wrap::aut_rebo_neigh(ka); +#endif +} + +template +void aut_frebo(KernelArgsAIREBOT * ka, int torsion_flag) { +#ifdef LMP_INTEL_AIREBO_REF + ref_frebo(ka, torsion_flag); +#else + aut_wrap::aut_frebo(ka, torsion_flag); +#endif +} + +#ifdef __INTEL_OFFLOAD +#pragma offload_attribute(pop) +#endif + +} + diff --git a/src/USER-INTEL/pair_airebo_intel.h b/src/USER-INTEL/pair_airebo_intel.h new file mode 100644 index 0000000000..d3179c09f1 --- /dev/null +++ b/src/USER-INTEL/pair_airebo_intel.h @@ -0,0 
+1,110 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(airebo/intel,PairAIREBOIntel) + +#else + +#ifndef LMP_PAIR_AIREBO_INTEL_H +#define LMP_PAIR_AIREBO_INTEL_H + +#include "pair.h" +#include "fix_intel.h" +#include "pair_airebo.h" +//#include "airebo_common.h" + +namespace LAMMPS_NS { + +template +struct PairAIREBOIntelParam; + +class PairAIREBOIntel : public PairAIREBO { + public: + PairAIREBOIntel(class LAMMPS *); + virtual ~PairAIREBOIntel(); + virtual void compute(int, int); + virtual void init_style(); + protected: + + template + void compute(int eflag, int vflag, IntelBuffers *buffers); + + template + void eval(const int offload, const int vflag, + IntelBuffers * buffers, + const int astart, const int aend); + + template + void pack_force_const(IntelBuffers * buffers); + + template + PairAIREBOIntelParam get_param(); + + FixIntel * fix; + int _cop; + + int * REBO_cnumneigh; + int * REBO_num_skin; + int * REBO_list_data; + +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Incorrect args for pair coefficients + +Self-explanatory. Check the input script or data file. + +E: Pair style AIREBO requires atom IDs + +This is a requirement to use the AIREBO potential. + +E: Pair style AIREBO requires newton pair on + +See the newton command. This is a restriction to use the AIREBO +potential. + +E: All pair coeffs are not set + +All pair coefficients must be set in the data file or by the +pair_coeff command before running a simulation. + +E: Neighbor list overflow, boost neigh_modify one + +There are too many neighbors of a single atom. Use the neigh_modify +command to increase the max number of neighbors allowed for one atom. +You may also want to boost the page size. + +E: Cannot open AIREBO potential file %s + +The specified AIREBO potential file cannot be opened. Check that the +path and name are correct. + +*/ diff --git a/src/USER-INTEL/pair_airebo_morse_intel.cpp b/src/USER-INTEL/pair_airebo_morse_intel.cpp new file mode 100644 index 0000000000..9c0f3b8ed0 --- /dev/null +++ b/src/USER-INTEL/pair_airebo_morse_intel.cpp @@ -0,0 +1,37 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#include "pair_airebo_morse_intel.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairAIREBOMorseIntel::PairAIREBOMorseIntel(LAMMPS *lmp) + : PairAIREBOIntel(lmp) {} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +void PairAIREBOMorseIntel::settings(int narg, char **arg) +{ + PairAIREBOIntel::settings(narg,arg); + + morseflag = 1; +} diff --git a/src/USER-INTEL/pair_airebo_morse_intel.h b/src/USER-INTEL/pair_airebo_morse_intel.h new file mode 100644 index 0000000000..5210ea80ee --- /dev/null +++ b/src/USER-INTEL/pair_airebo_morse_intel.h @@ -0,0 +1,40 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(airebo/morse/intel,PairAIREBOMorseIntel) + +#else + +#ifndef LMP_PAIR_AIREBO_MORSE_INTEL_H +#define LMP_PAIR_AIREBO_MORSE_INTEL_H + +#include "pair_airebo_intel.h" + +namespace LAMMPS_NS { + +class PairAIREBOMorseIntel : public PairAIREBOIntel { + public: + PairAIREBOMorseIntel(class LAMMPS *); + virtual void settings(int, char **); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_eam_alloy_intel.cpp b/src/USER-INTEL/pair_eam_alloy_intel.cpp new file mode 100644 index 0000000000..4f47c7ee23 --- /dev/null +++ b/src/USER-INTEL/pair_eam_alloy_intel.cpp @@ -0,0 +1,326 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "pair_eam_alloy_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXLINE 1024 + +/* ---------------------------------------------------------------------- */ + +PairEAMAlloyIntel::PairEAMAlloyIntel(LAMMPS *lmp) : PairEAMIntel(lmp) +{ + one_coeff = 1; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs + read DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMAlloyIntel::coeff(int narg, char **arg) +{ + int i,j; + + if (!allocated) allocate(); + + if (narg != 3 + atom->ntypes) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // insure I,J args are * * + + if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // read EAM setfl file + + if (setfl) { + for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i]; + delete [] setfl->elements; + delete [] setfl->mass; + memory->destroy(setfl->frho); + memory->destroy(setfl->rhor); + memory->destroy(setfl->z2r); + delete setfl; + } + setfl = new Setfl(); + read_file(arg[2]); + + // read args that map atom types to elements in potential file + // map[i] = which element the Ith atom type is, -1 if NULL + + for (i = 3; i < narg; i++) { + if (strcmp(arg[i],"NULL") == 0) { + map[i-2] = -1; + continue; + } + for (j = 0; j < setfl->nelements; j++) + if (strcmp(arg[i],setfl->elements[j]) == 0) break; + if (j < setfl->nelements) map[i-2] = j; + else error->all(FLERR,"No matching element in EAM potential file"); + } + + // clear setflag since coeff() called once with I,J = * * + + int n = atom->ntypes; + for (i = 1; i <= n; i++) + for (j = i; j <= n; j++) + setflag[i][j] = 0; + + // set setflag i,j for type pairs where both are mapped to elements + // set mass of atom type if i = j + + int count = 0; + for (i = 1; i <= n; i++) { + for (j = i; j <= n; j++) { + if (map[i] >= 0 && map[j] >= 0) { + setflag[i][j] = 1; + if (i == j) atom->set_mass(FLERR,i,setfl->mass[map[i]]); + count++; + } + scale[i][j] = 1.0; + } + } + + if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); +} + +/* ---------------------------------------------------------------------- + read a multi-element DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMAlloyIntel::read_file(char *filename) +{ + Setfl *file = setfl; + + // open potential file + + int me = comm->me; + FILE *fptr; + char line[MAXLINE]; + + if (me == 0) { + fptr = force->open_potential(filename); + if (fptr == NULL) { + char str[128]; + sprintf(str,"Cannot open EAM potential file %s",filename); + error->one(FLERR,str); + } + } + + // read and broadcast header + // extract element names from nelements line + + int n; + if (me == 0) { + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + n = strlen(line) + 1; + } + MPI_Bcast(&n,1,MPI_INT,0,world); + MPI_Bcast(line,n,MPI_CHAR,0,world); + + sscanf(line,"%d",&file->nelements); + int nwords = 
atom->count_words(line); + if (nwords != file->nelements + 1) + error->all(FLERR,"Incorrect element names in EAM potential file"); + + char **words = new char*[file->nelements+1]; + nwords = 0; + strtok(line," \t\n\r\f"); + while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue; + + file->elements = new char*[file->nelements]; + for (int i = 0; i < file->nelements; i++) { + n = strlen(words[i]) + 1; + file->elements[i] = new char[n]; + strcpy(file->elements[i],words[i]); + } + delete [] words; + + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg %d %lg %lg", + &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); + } + + MPI_Bcast(&file->nrho,1,MPI_INT,0,world); + MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->nr,1,MPI_INT,0,world); + MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); + + file->mass = new double[file->nelements]; + memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho"); + memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor"); + memory->create(file->z2r,file->nelements,file->nelements,file->nr+1, + "pair:z2r"); + + int i,j,tmp; + for (i = 0; i < file->nelements; i++) { + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg",&tmp,&file->mass[i]); + } + MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); + + if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); + MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); + if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]); + MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world); + } + + for (i = 0; i < file->nelements; i++) + for (j = 0; j <= i; j++) { + if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); + MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + + // close the potential file + + if (me == 0) fclose(fptr); +} + +/* ---------------------------------------------------------------------- + copy read-in setfl potential to standard array format +------------------------------------------------------------------------- */ + +void PairEAMAlloyIntel::file2array() +{ + int i,j,m,n; + int ntypes = atom->ntypes; + + // set function params directly from setfl file + + nrho = setfl->nrho; + nr = setfl->nr; + drho = setfl->drho; + dr = setfl->dr; + rhomax = (nrho-1) * drho; + + // ------------------------------------------------------------------ + // setup frho arrays + // ------------------------------------------------------------------ + + // allocate frho arrays + // nfrho = # of setfl elements + 1 for zero array + + nfrho = setfl->nelements + 1; + memory->destroy(frho); + memory->create(frho,nfrho,nrho+1,"pair:frho"); + + // copy each element's frho to global frho + + for (i = 0; i < setfl->nelements; i++) + for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m]; + + // add extra frho of zeroes for non-EAM types to point to (pair hybrid) + // this is necessary b/c fp is still computed for non-EAM atoms + + for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; + + // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to + // if atom type doesn't point to element (non-EAM atom in pair hybrid) + // then map it to last frho array of zeroes + + for (i = 1; i <= ntypes; i++) + if (map[i] >= 0) type2frho[i] = map[i]; + else type2frho[i] = nfrho-1; + + // ------------------------------------------------------------------ + // setup rhor arrays + // ------------------------------------------------------------------ + + // allocate rhor arrays + // nrhor = # of setfl elements + + 
nrhor = setfl->nelements; + memory->destroy(rhor); + memory->create(rhor,nrhor,nr+1,"pair:rhor"); + + // copy each element's rhor to global rhor + + for (i = 0; i < setfl->nelements; i++) + for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m]; + + // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to + // for setfl files, I,J mapping only depends on I + // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used + + for (i = 1; i <= ntypes; i++) + for (j = 1; j <= ntypes; j++) + type2rhor[i][j] = map[i]; + + // ------------------------------------------------------------------ + // setup z2r arrays + // ------------------------------------------------------------------ + + // allocate z2r arrays + // nz2r = N*(N+1)/2 where N = # of setfl elements + + nz2r = setfl->nelements * (setfl->nelements+1) / 2; + memory->destroy(z2r); + memory->create(z2r,nz2r,nr+1,"pair:z2r"); + + // copy each element pair z2r to global z2r, only for I >= J + + n = 0; + for (i = 0; i < setfl->nelements; i++) + for (j = 0; j <= i; j++) { + for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m]; + n++; + } + + // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to + // set of z2r arrays only fill lower triangular Nelement matrix + // value = n = sum over rows of lower-triangular matrix until reach irow,icol + // swap indices when irow < icol to stay lower triangular + // if map = -1 (non-EAM atom in pair hybrid): + // type2z2r is not used by non-opt + // but set type2z2r to 0 since accessed by opt + + int irow,icol; + for (i = 1; i <= ntypes; i++) { + for (j = 1; j <= ntypes; j++) { + irow = map[i]; + icol = map[j]; + if (irow == -1 || icol == -1) { + type2z2r[i][j] = 0; + continue; + } + if (irow < icol) { + irow = map[j]; + icol = map[i]; + } + n = 0; + for (m = 0; m < irow; m++) n += m + 1; + n += icol; + type2z2r[i][j] = n; + } + } +} diff --git a/src/USER-INTEL/pair_eam_alloy_intel.h b/src/USER-INTEL/pair_eam_alloy_intel.h new file mode 100644 index 0000000000..4967c3709d --- /dev/null +++ b/src/USER-INTEL/pair_eam_alloy_intel.h @@ -0,0 +1,43 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(eam/alloy/intel,PairEAMAlloyIntel) + +#else + +#ifndef LMP_PAIR_EAM_ALLOY_INTEL_H +#define LMP_PAIR_EAM_ALLOY_INTEL_H + +#include "pair_eam_intel.h" + +namespace LAMMPS_NS { + +// need virtual public b/c of how eam/alloy/opt inherits from it + +class PairEAMAlloyIntel : virtual public PairEAMIntel { + public: + PairEAMAlloyIntel(class LAMMPS *); + virtual ~PairEAMAlloyIntel() {} + void coeff(int, char **); + + protected: + void read_file(char *); + void file2array(); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_eam_fs_intel.cpp b/src/USER-INTEL/pair_eam_fs_intel.cpp new file mode 100644 index 0000000000..cfcc8200cc --- /dev/null +++ b/src/USER-INTEL/pair_eam_fs_intel.cpp @@ -0,0 +1,335 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Tim Lau (MIT) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "pair_eam_fs_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXLINE 1024 + +/* ---------------------------------------------------------------------- */ + +PairEAMFSIntel::PairEAMFSIntel(LAMMPS *lmp) : PairEAMIntel(lmp) +{ + one_coeff = 1; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs + read EAM Finnis-Sinclair file +------------------------------------------------------------------------- */ + +void PairEAMFSIntel::coeff(int narg, char **arg) +{ + int i,j; + + if (!allocated) allocate(); + + if (narg != 3 + atom->ntypes) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // insure I,J args are * * + + if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0) + error->all(FLERR,"Incorrect args for pair coefficients"); + + // read EAM Finnis-Sinclair file + + if (fs) { + for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i]; + delete [] fs->elements; + delete [] fs->mass; + memory->destroy(fs->frho); + memory->destroy(fs->rhor); + memory->destroy(fs->z2r); + delete fs; + } + fs = new Fs(); + read_file(arg[2]); + + // read args that map atom types to elements in potential file + // map[i] = which element the Ith atom type is, -1 if NULL + + for (i = 3; i < narg; i++) { + if (strcmp(arg[i],"NULL") == 0) { + map[i-2] = -1; + continue; + } + for (j = 0; j < fs->nelements; j++) + if (strcmp(arg[i],fs->elements[j]) == 0) break; + if (j < fs->nelements) map[i-2] = j; + else error->all(FLERR,"No matching element in EAM potential file"); + } + + // clear setflag since coeff() called once with I,J = * * + + int n = atom->ntypes; + for (i = 1; i <= n; i++) + for (j = i; j <= n; j++) + setflag[i][j] = 0; + + // set setflag i,j for type pairs where both are mapped 
to elements + // set mass of atom type if i = j + + int count = 0; + for (i = 1; i <= n; i++) { + for (j = i; j <= n; j++) { + if (map[i] >= 0 && map[j] >= 0) { + setflag[i][j] = 1; + if (i == j) atom->set_mass(FLERR,i,fs->mass[map[i]]); + count++; + } + scale[i][j] = 1.0; + } + } + + if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients"); +} + +/* ---------------------------------------------------------------------- + read a multi-element DYNAMO setfl file +------------------------------------------------------------------------- */ + +void PairEAMFSIntel::read_file(char *filename) +{ + Fs *file = fs; + + // open potential file + + int me = comm->me; + FILE *fptr; + char line[MAXLINE]; + + if (me == 0) { + fptr = force->open_potential(filename); + if (fptr == NULL) { + char str[128]; + sprintf(str,"Cannot open EAM potential file %s",filename); + error->one(FLERR,str); + } + } + + // read and broadcast header + // extract element names from nelements line + + int n; + if (me == 0) { + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + fgets(line,MAXLINE,fptr); + n = strlen(line) + 1; + } + MPI_Bcast(&n,1,MPI_INT,0,world); + MPI_Bcast(line,n,MPI_CHAR,0,world); + + sscanf(line,"%d",&file->nelements); + int nwords = atom->count_words(line); + if (nwords != file->nelements + 1) + error->all(FLERR,"Incorrect element names in EAM potential file"); + + char **words = new char*[file->nelements+1]; + nwords = 0; + strtok(line," \t\n\r\f"); + while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue; + + file->elements = new char*[file->nelements]; + for (int i = 0; i < file->nelements; i++) { + n = strlen(words[i]) + 1; + file->elements[i] = new char[n]; + strcpy(file->elements[i],words[i]); + } + delete [] words; + + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg %d %lg %lg", + &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut); + } + + MPI_Bcast(&file->nrho,1,MPI_INT,0,world); + MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->nr,1,MPI_INT,0,world); + MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world); + MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world); + + file->mass = new double[file->nelements]; + memory->create(file->frho,file->nelements,file->nrho+1, + "pair:frho"); + memory->create(file->rhor,file->nelements,file->nelements, + file->nr+1,"pair:rhor"); + memory->create(file->z2r,file->nelements,file->nelements, + file->nr+1,"pair:z2r"); + + int i,j,tmp; + for (i = 0; i < file->nelements; i++) { + if (me == 0) { + fgets(line,MAXLINE,fptr); + sscanf(line,"%d %lg",&tmp,&file->mass[i]); + } + MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world); + + if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]); + MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world); + + for (j = 0; j < file->nelements; j++) { + if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]); + MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + } + + for (i = 0; i < file->nelements; i++) + for (j = 0; j <= i; j++) { + if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]); + MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world); + } + + // close the potential file + + if (me == 0) fclose(fptr); +} + +/* ---------------------------------------------------------------------- + copy read-in setfl potential to standard array format +------------------------------------------------------------------------- */ + +void PairEAMFSIntel::file2array() +{ + int i,j,m,n; + int ntypes = atom->ntypes; + + // set function params directly from fs file 
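+  // (identical in structure to the setfl file2array above; fs files just
+  //  provide a full NxN set of rhor arrays per element pair)
+  // note that the z2r packing at the bottom of this function computes the
+  // lower-triangular index n = irow*(irow+1)/2 + icol for irow >= icol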
+ + nrho = fs->nrho; + nr = fs->nr; + drho = fs->drho; + dr = fs->dr; + rhomax = (nrho-1) * drho; + + // ------------------------------------------------------------------ + // setup frho arrays + // ------------------------------------------------------------------ + + // allocate frho arrays + // nfrho = # of fs elements + 1 for zero array + + nfrho = fs->nelements + 1; + memory->destroy(frho); + memory->create(frho,nfrho,nrho+1,"pair:frho"); + + // copy each element's frho to global frho + + for (i = 0; i < fs->nelements; i++) + for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m]; + + // add extra frho of zeroes for non-EAM types to point to (pair hybrid) + // this is necessary b/c fp is still computed for non-EAM atoms + + for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0; + + // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to + // if atom type doesn't point to element (non-EAM atom in pair hybrid) + // then map it to last frho array of zeroes + + for (i = 1; i <= ntypes; i++) + if (map[i] >= 0) type2frho[i] = map[i]; + else type2frho[i] = nfrho-1; + + // ------------------------------------------------------------------ + // setup rhor arrays + // ------------------------------------------------------------------ + + // allocate rhor arrays + // nrhor = square of # of fs elements + + nrhor = fs->nelements * fs->nelements; + memory->destroy(rhor); + memory->create(rhor,nrhor,nr+1,"pair:rhor"); + + // copy each element pair rhor to global rhor + + n = 0; + for (i = 0; i < fs->nelements; i++) + for (j = 0; j < fs->nelements; j++) { + for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m]; + n++; + } + + // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to + // for fs files, there is a full NxN set of rhor arrays + // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used + + for (i = 1; i <= ntypes; i++) + for (j = 1; j <= ntypes; j++) + type2rhor[i][j] = map[i] * fs->nelements + map[j]; + + // ------------------------------------------------------------------ + // setup z2r arrays + // ------------------------------------------------------------------ + + // allocate z2r arrays + // nz2r = N*(N+1)/2 where N = # of fs elements + + nz2r = fs->nelements * (fs->nelements+1) / 2; + memory->destroy(z2r); + memory->create(z2r,nz2r,nr+1,"pair:z2r"); + + // copy each element pair z2r to global z2r, only for I >= J + + n = 0; + for (i = 0; i < fs->nelements; i++) + for (j = 0; j <= i; j++) { + for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m]; + n++; + } + + // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to + // set of z2r arrays only fill lower triangular Nelement matrix + // value = n = sum over rows of lower-triangular matrix until reach irow,icol + // swap indices when irow < icol to stay lower triangular + // if map = -1 (non-EAM atom in pair hybrid): + // type2z2r is not used by non-opt + // but set type2z2r to 0 since accessed by opt + + int irow,icol; + for (i = 1; i <= ntypes; i++) { + for (j = 1; j <= ntypes; j++) { + irow = map[i]; + icol = map[j]; + if (irow == -1 || icol == -1) { + type2z2r[i][j] = 0; + continue; + } + if (irow < icol) { + irow = map[j]; + icol = map[i]; + } + n = 0; + for (m = 0; m < irow; m++) n += m + 1; + n += icol; + type2z2r[i][j] = n; + } + } +} diff --git a/src/USER-INTEL/pair_eam_fs_intel.h b/src/USER-INTEL/pair_eam_fs_intel.h new file mode 100644 index 0000000000..da2ab9d2d7 --- /dev/null +++ b/src/USER-INTEL/pair_eam_fs_intel.h @@ -0,0 +1,43 @@ +/* -*- c++ -*- 
---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(eam/fs/intel,PairEAMFSIntel) + +#else + +#ifndef LMP_PAIR_EAM_FS_INTEL_H +#define LMP_PAIR_EAM_FS_INTEL_H + +#include "pair_eam_intel.h" + +namespace LAMMPS_NS { + +// need virtual public b/c of how eam/fs/opt inherits from it + +class PairEAMFSIntel : virtual public PairEAMIntel { + public: + PairEAMFSIntel(class LAMMPS *); + virtual ~PairEAMFSIntel() {} + void coeff(int, char **); + + protected: + void read_file(char *); + void file2array(); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index ed7dd424af..3fbb58308b 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -428,7 +428,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } else multiple_forms = true; } - const int edge = (packed_j % pad_width); + const int edge = packed_j & (pad_width - 1); if (edge) { const int packed_end = packed_j + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp new file mode 100644 index 0000000000..0dc2c275e8 --- /dev/null +++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -0,0 +1,595 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + This software is distributed under the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_charmm_coul_charmm_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision()==FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
+                                          IntelBuffers<flt_t,acc_t> *buffers,
+                                          const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // -------------------- Regular version
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
+                                       IntelBuffers<flt_t,acc_t> *buffers,
+                                       const ForceConst<flt_t> &fc,
+                                       const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);
+
+ const int * _noalias const numneigh = list->numneigh; + const int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int * _noalias const firstneigh = buffers->firstneigh(list); + + const flt_t * _noalias const special_coul = fc.special_coul; + const flt_t * _noalias const special_lj = fc.special_lj; + const flt_t qqrd2e = force->qqrd2e; + const flt_t inv_denom_lj = (flt_t)1.0/denom_lj; + const flt_t inv_denom_coul = (flt_t)1.0/denom_coul; + + const flt_t * _noalias const cutsq = fc.cutsq[0]; + const LJ_T * _noalias const lj = fc.lj[0]; + const flt_t cut_ljsq = fc.cut_ljsq; + const flt_t cut_lj_innersq = fc.cut_lj_innersq; + const flt_t cut_coul_innersq = fc.cut_coul_innersq; + const flt_t cut_coulsq = fc.cut_coulsq; + + const int ntypes = atom->ntypes + 1; + const int eatom = this->eflag_atom; + + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + + // Determine how much data to transfer + int x_size, q_size, f_stride, ev_size, separate_flag; + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); + + int tc; + FORCE_T * _noalias f_start; + acc_t * _noalias ev_global; + IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global); + + const int nthreads = tc; + #ifdef _LMP_INTEL_OFFLOAD + int *overflow = fix->get_off_overflow_flag(); + double *timer_compute = fix->off_watch_pair(); + + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); + #pragma offload target(mic:_cop) if(offload) \ + in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ + in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ + in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ + in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \ + in(vflag,eatom,f_stride,separate_flag,offload) \ + in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \ + in(inv_denom_coul,cut_coul_innersq) \ + out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ + out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(f_start) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); + + acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + + // loop over neighbors of my atoms + #if defined(_OPENMP) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #endif + { + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); + iifrom += astart; + iito += astart; + + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * 
_noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + flt_t cutboth = cut_coulsq; + + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; + + for (int i = iifrom; i < iito; i += iip) { + // const int i = ilist[ii]; + const int itype = x[i].w; + + const int ptr_off = itype * ntypes; + const flt_t * _noalias const cutsqi = cutsq + ptr_off; + const LJ_T * _noalias const lji = lj + ptr_off; + + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp,fytmp,fztmp,fwtmp; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const flt_t qtmp = q[i]; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + int ej = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj] & NEIGHMASK; + const flt_t delx = xtmp - x[j].x; + const flt_t dely = ytmp - x[j].y; + const flt_t delz = ztmp - x[j].z; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq < cut_coulsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + flt_t forcecoul, forcelj, evdwl; + forcecoul = forcelj = evdwl = (flt_t)0.0; + + const int j = tj[jj] & NEIGHMASK; + const int sbindex = tj[jj] >> SBBITS & 3; + const flt_t rsq = trsq[jj]; + const flt_t r2inv = (flt_t)1.0 / rsq; + const flt_t r_inv = (flt_t)1.0 / sqrt(rsq); + forcecoul = qqrd2e * qtmp * q[j] * r_inv; + if (rsq > cut_coul_innersq) { + const flt_t ccr = cut_coulsq - rsq; + const flt_t switch1 = ccr * ccr * inv_denom_coul * + (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq); + forcecoul *= switch1; + } + + #ifdef INTEL_VMASK + if (rsq < cut_ljsq) { + #endif + const int jtype = tjtype[jj]; + flt_t r6inv = r2inv * r2inv * r2inv; + forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); + if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); + + #ifdef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif + const flt_t drsq = cut_ljsq - rsq; + const flt_t cut2 = (rsq - cut_lj_innersq) * drsq; + const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) * + inv_denom_lj; + const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj; + if (EFLAG) { + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif + forcelj = forcelj * switch1 + evdwl * switch2; + evdwl *= switch1; + #ifndef INTEL_VMASK + } + #endif + } else { + const flt_t philj = r6inv * (lji[jtype].z*r6inv - + lji[jtype].w); + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) + #endif + forcelj = forcelj * switch1 + philj * switch2; + } + #ifdef INTEL_VMASK + } + #endif + + #ifdef INTEL_VMASK + } + #else + if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + if (sbindex) { + const flt_t 
factor_coul = special_coul[sbindex];
+          forcecoul *= factor_coul;
+          const flt_t factor_lj = special_lj[sbindex];
+          forcelj *= factor_lj;
+          if (EFLAG) evdwl *= factor_lj;
+        }
+
+        const flt_t fpair = (forcecoul + forcelj) * r2inv;
+        const flt_t fpx = fpair * tdelx[jj];
+        fxtmp += fpx;
+        if (NEWTON_PAIR) f[j].x -= fpx;
+        const flt_t fpy = fpair * tdely[jj];
+        fytmp += fpy;
+        if (NEWTON_PAIR) f[j].y -= fpy;
+        const flt_t fpz = fpair * tdelz[jj];
+        fztmp += fpz;
+        if (NEWTON_PAIR) f[j].z -= fpz;
+
+        if (EFLAG) {
+          sevdwl += evdwl;
+          secoul += forcecoul;
+          if (eatom) {
+            fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+            if (NEWTON_PAIR)
+              f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+          }
+        }
+        if (NEWTON_PAIR == 0)
+          IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                fpx, fpy, fpz);
+      } // for jj
+      if (NEWTON_PAIR) {
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+      } else {
+        f[i].x = fxtmp;
+        f[i].y = fytmp;
+        f[i].z = fztmp;
+      }
+      IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+    } // for ii
+
+    IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                            f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                            ov4, ov5);
+  } // end of omp parallel region
+
+  IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                      ov0, ov1, ov2, ov3, ov4, ov5);
+
+  if (EFLAG) {
+    if (NEWTON_PAIR == 0) {
+      oevdwl *= (acc_t)0.5;
+      oecoul *= (acc_t)0.5;
+    }
+    ev_global[0] = oevdwl;
+    ev_global[1] = oecoul;
+  }
+  if (vflag) {
+    if (NEWTON_PAIR == 0) {
+      ov0 *= (acc_t)0.5;
+      ov1 *= (acc_t)0.5;
+      ov2 *= (acc_t)0.5;
+      ov3 *= (acc_t)0.5;
+      ov4 *= (acc_t)0.5;
+      ov5 *= (acc_t)0.5;
+    }
+    ev_global[2] = ov0;
+    ev_global[3] = ov1;
+    ev_global[4] = ov2;
+    ev_global[5] = ov3;
+    ev_global[6] = ov4;
+    ev_global[7] = ov5;
+  }
+  #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+  *timer_compute = MIC_Wtime() - *timer_compute;
+  #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::init_style()
+{
+  PairLJCharmmCoulCharmm::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                                   IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
+  int tp1 = atom->ntypes + 1;
+
+  fc.set_ntypes(tp1, memory, _cop);
+
buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+      "Intel variant of lj/charmm/coul/charmm expects lj cutoff<=coulombic");
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.cut_coulsq = cut_coulsq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_coul_innersq = cut_coul_innersq;
+  fc.cut_lj_innersq = cut_lj_innersq;
+
+  for (int i = 0; i < 4; i++) {
+    fc.special_lj[i] = force->special_lj[i];
+    fc.special_coul[i] = force->special_coul[i];
+    fc.special_coul[0] = 1.0;
+    fc.special_lj[0] = 1.0;
+  }
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.lj[i][j].x = lj1[i][j];
+      fc.lj[i][j].y = lj2[i][j];
+      fc.lj[i][j].z = lj3[i][j];
+      fc.lj[i][j].w = lj4[i][j];
+      fc.cutsq[i][j] = cutsq[i][j];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
+  const int ntypes, Memory *memory, const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+          nocopy(ocutsq, olj: alloc_if(0) free_if(1))
+      }
+      #endif
+
+      _memory->destroy(cutsq);
+      _memory->destroy(lj);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+      memory->create(lj,ntypes,ntypes,"fc.lj");
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      int tp1sq = ntypes*ntypes;
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
new file mode 100644
index 0000000000..64d6077477
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/intel,PairLJCharmmCoulCharmmIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmIntel : public PairLJCharmmCoulCharmm {
+
+ public:
+  PairLJCharmmCoulCharmmIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulCharmmIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;
+  int _cop, _ccache_stride;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t,acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+  template <class flt_t>
+  class ForceConst {
+   public:
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
+    flt_t **cutsq;
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_coul_innersq, cut_lj_innersq;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
+
+    ForceConst() : _ntypes(0) {}
+    ~ForceConst() { set_ntypes(0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel variant of lj/charmm/coul/charmm expects lj cutoff<=coulombic
+
+The Intel-accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
diff --git a/src/USER-INTEL/pair_rebo_intel.cpp b/src/USER-INTEL/pair_rebo_intel.cpp
new file mode 100644
index 0000000000..006830a5fa
--- /dev/null
+++ b/src/USER-INTEL/pair_rebo_intel.cpp
@@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#include "pair_rebo_intel.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairREBOIntel::PairREBOIntel(LAMMPS *lmp) : PairAIREBOIntel(lmp) {} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +void PairREBOIntel::settings(int narg, char **arg) +{ + if (narg != 0) error->all(FLERR,"Illegal pair_style command"); + + cutlj = 0.0; + ljflag = torflag = 0; + // + // this one parameter for C-C interactions is different in REBO vs AIREBO + // see Favata, Micheletti, Ryu, Pugno, Comp Phys Comm (2016) + + PCCf_2_0 = 0.0; +} diff --git a/src/USER-INTEL/pair_rebo_intel.h b/src/USER-INTEL/pair_rebo_intel.h new file mode 100644 index 0000000000..e76279a248 --- /dev/null +++ b/src/USER-INTEL/pair_rebo_intel.h @@ -0,0 +1,40 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Markus Hohnerbach (RWTH) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(rebo/intel,PairREBOIntel) + +#else + +#ifndef LMP_PAIR_REBO_INTEL_H +#define LMP_PAIR_REBO_INTEL_H + +#include "pair_airebo_intel.h" + +namespace LAMMPS_NS { + +class PairREBOIntel : public PairAIREBOIntel { + public: + PairREBOIntel(class LAMMPS *); + virtual void settings(int, char **); +}; + +} + +#endif +#endif diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index 7a6b7afd92..fff104f39b 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -345,16 +345,17 @@ void PairSWIntel::eval(const int offload, const int vflag, if (jj < jnumhalf) ejnumhalf++; } } - int ejnum_pad = ejnum; - while ( (ejnum_pad % pad_width) != 0) { - tdelx[ejnum_pad] = (flt_t)0.0; - tdely[ejnum_pad] = (flt_t)0.0; - tdelz[ejnum_pad] = (flt_t)0.0; - trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0; - tj[ejnum_pad] = nall; - if (!ONETYPE) tjtype[ejnum_pad] = 0; - ejnum_pad++; + int ejrem = ejnum & (pad_width - 1); + if (ejrem) ejrem = pad_width - ejrem; + const int ejnum_pad = ejnum + ejrem; + for (int jj = ejnum; jj < ejnum_pad; jj++) { + tdelx[jj] = (flt_t)0.0; + tdely[jj] = (flt_t)0.0; + tdelz[jj] = (flt_t)0.0; + trsq[jj] = p2[3].cutsq + (flt_t)1.0; + tj[jj] = nall; + if (!ONETYPE) tjtype[jj] = 0; } #if defined(LMP_SIMD_COMPILER) diff --git a/src/force.cpp b/src/force.cpp index 33e6630406..060cae10eb 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -833,10 +833,6 @@ void Force::set_special(int narg, char **arg) else if (strcmp(arg[iarg+1],"yes") == 0) special_dihedral = 1; else error->all(FLERR,"Illegal special_bonds command"); iarg += 2; - } else if (strcmp(arg[iarg],"extra") == 0) { - if (iarg+2 > narg) error->all(FLERR,"Illegal special_bonds command"); - special_extra = atoi(arg[iarg+1]); - iarg += 2; } else error->all(FLERR,"Illegal special_bonds command"); } @@ -844,8 +840,6 @@ void Force::set_special(int narg, char **arg) if (special_lj[i] < 0.0 || special_lj[i] > 1.0 || special_coul[i] < 0.0 || special_coul[i] > 1.0) error->all(FLERR,"Illegal special_bonds command"); - - if (special_extra < 0) error->all(FLERR,"Illegal special_bonds command"); } /* ---------------------------------------------------------------------- diff --git a/src/info.cpp b/src/info.cpp index 9fcc24fde9..03eb1e10ed 100644 --- a/src/info.cpp +++ b/src/info.cpp @@ -45,7 +45,7 @@ #include #ifdef _WIN32 -#define PSAPI_VERSION=1 +#define PSAPI_VERSION 1 #include #include #include diff --git a/src/input.cpp b/src/input.cpp index 570560373a..7d11b8741b 100644 --- a/src/input.cpp +++ b/src/input.cpp @@ -1867,7 +1867,6 @@ void Input::special_bonds() double coul3 = force->special_coul[3]; int angle = force->special_angle; int dihedral = force->special_dihedral; - int extra = force->special_extra; force->set_special(narg,arg); @@ -1877,8 +1876,7 @@ void Input::special_bonds() if (lj2 != force->special_lj[2] || lj3 != force->special_lj[3] || coul2 != force->special_coul[2] || coul3 != force->special_coul[3] || angle != force->special_angle || - dihedral != force->special_dihedral || - extra != force->special_extra) { + dihedral != force->special_dihedral) { Special special(lmp); special.build(); } diff --git a/src/molecule.cpp b/src/molecule.cpp index e0e9ec8aaf..b0fec4bcbc 100644 
--- a/src/molecule.cpp +++ b/src/molecule.cpp @@ -427,47 +427,61 @@ void Molecule::read(int flag) // search line for header keywords and set corresponding variable - if (strstr(line,"atoms")) sscanf(line,"%d",&natoms); - else if (strstr(line,"bonds")) sscanf(line,"%d",&nbonds); - else if (strstr(line,"angles")) sscanf(line,"%d",&nangles); - else if (strstr(line,"dihedrals")) sscanf(line,"%d",&ndihedrals); - else if (strstr(line,"impropers")) sscanf(line,"%d",&nimpropers); - - else if (strstr(line,"mass")) { + int nmatch = 0; + int nwant = 0; + if (strstr(line,"atoms")) { + nmatch = sscanf(line,"%d",&natoms); + nwant = 1; + } else if (strstr(line,"bonds")) { + nmatch = sscanf(line,"%d",&nbonds); + nwant = 1; + } else if (strstr(line,"angles")) { + nmatch = sscanf(line,"%d",&nangles); + nwant = 1; + } else if (strstr(line,"dihedrals")) { + nmatch = sscanf(line,"%d",&ndihedrals); + nwant = 1; + } else if (strstr(line,"impropers")) { + nmatch = sscanf(line,"%d",&nimpropers); + nwant = 1; + } else if (strstr(line,"mass")) { massflag = 1; - sscanf(line,"%lg",&masstotal); + nmatch = sscanf(line,"%lg",&masstotal); + nwant = 1; masstotal *= sizescale*sizescale*sizescale; - } - else if (strstr(line,"com")) { + } else if (strstr(line,"com")) { comflag = 1; - sscanf(line,"%lg %lg %lg",&com[0],&com[1],&com[2]); + nmatch = sscanf(line,"%lg %lg %lg",&com[0],&com[1],&com[2]); + nwant = 3; com[0] *= sizescale; com[1] *= sizescale; com[2] *= sizescale; if (domain->dimension == 2 && com[2] != 0.0) error->all(FLERR,"Molecule file z center-of-mass must be 0.0 for 2d"); - } - else if (strstr(line,"inertia")) { + } else if (strstr(line,"inertia")) { inertiaflag = 1; - sscanf(line,"%lg %lg %lg %lg %lg %lg", - &itensor[0],&itensor[1],&itensor[2], - &itensor[3],&itensor[4],&itensor[5]); - itensor[0] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[1] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[2] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[3] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[4] *= sizescale*sizescale*sizescale*sizescale*sizescale; - itensor[5] *= sizescale*sizescale*sizescale*sizescale*sizescale; - } - else if (strstr(line,"body")) { + nmatch = sscanf(line,"%lg %lg %lg %lg %lg %lg", + &itensor[0],&itensor[1],&itensor[2], + &itensor[3],&itensor[4],&itensor[5]); + nwant = 6; + const double scale5 = sizescale*sizescale*sizescale*sizescale*sizescale; + itensor[0] *= scale5; + itensor[1] *= scale5; + itensor[2] *= scale5; + itensor[3] *= scale5; + itensor[4] *= scale5; + itensor[5] *= scale5; + } else if (strstr(line,"body")) { bodyflag = 1; avec_body = (AtomVecBody *) atom->style_match("body"); if (!avec_body) error->all(FLERR,"Molecule file requires atom style body"); - sscanf(line,"%d %d",&nibody,&ndbody); - } + nmatch = sscanf(line,"%d %d",&nibody,&ndbody); + nwant = 2; + } else break; - else break; + if (nmatch != nwant) + error->all(FLERR,"Invalid header in molecule file"); } // error checks @@ -493,7 +507,7 @@ void Molecule::read(int flag) // loop over sections of molecule file - while (strlen(keyword)) { + while (strlen(keyword) > 0) { if (strcmp(keyword,"Coords") == 0) { xflag = 1; if (flag) coords(line); @@ -517,22 +531,22 @@ void Molecule::read(int flag) } else if (strcmp(keyword,"Bonds") == 0) { if (nbonds == 0) - error->all(FLERR,"Molecule file has bonds but no nbonds setting"); + error->all(FLERR,"Molecule file has bonds but no nbonds setting"); bondflag = tag_require = 1; bonds(flag,line); } else if 
(strcmp(keyword,"Angles") == 0) { if (nangles == 0) - error->all(FLERR,"Molecule file has angles but no nangles setting"); + error->all(FLERR,"Molecule file has angles but no nangles setting"); angleflag = tag_require = 1; angles(flag,line); } else if (strcmp(keyword,"Dihedrals") == 0) { if (ndihedrals == 0) error->all(FLERR,"Molecule file has dihedrals " - "but no ndihedrals setting"); + "but no ndihedrals setting"); dihedralflag = tag_require = 1; dihedrals(flag,line); } else if (strcmp(keyword,"Impropers") == 0) { if (nimpropers == 0) error->all(FLERR,"Molecule file has impropers " - "but no nimpropers setting"); + "but no nimpropers setting"); improperflag = tag_require = 1; impropers(flag,line); @@ -552,26 +566,26 @@ void Molecule::read(int flag) shakeatomflag = tag_require = 1; if (shaketypeflag) shakeflag = 1; if (!shakeflagflag) - error->all(FLERR,"Molecule file shake flags not before shake atoms"); + error->all(FLERR,"Molecule file shake flags not before shake atoms"); if (flag) shakeatom_read(line); else skip_lines(natoms,line); } else if (strcmp(keyword,"Shake Bond Types") == 0) { shaketypeflag = 1; if (shakeatomflag) shakeflag = 1; if (!shakeflagflag) - error->all(FLERR,"Molecule file shake flags not before shake bonds"); + error->all(FLERR,"Molecule file shake flags not before shake bonds"); if (flag) shaketype_read(line); else skip_lines(natoms,line); } else if (strcmp(keyword,"Body Integers") == 0) { if (bodyflag == 0 || nibody == 0) - error->all(FLERR,"Molecule file has body params " + error->all(FLERR,"Molecule file has body params " "but no setting for them"); ibodyflag = 1; body(flag,0,line); } else if (strcmp(keyword,"Body Doubles") == 0) { if (bodyflag == 0 || ndbody == 0) - error->all(FLERR,"Molecule file has body params " + error->all(FLERR,"Molecule file has body params " "but no setting for them"); dbodyflag = 1; body(flag,1,line); @@ -618,7 +632,7 @@ void Molecule::read(int flag) // body particle must have natom = 1 // set radius by having body class compute its own radius - + if (bodyflag) { radiusflag = 1; if (natoms != 1) @@ -641,12 +655,9 @@ void Molecule::coords(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 4) - error->all(FLERR,"Invalid Coords section in molecule file"); - } - sscanf(line,"%d %lg %lg %lg",&tmp,&x[i][0],&x[i][1],&x[i][2]); + if (4 != sscanf(line,"%d %lg %lg %lg",&tmp,&x[i][0],&x[i][1],&x[i][2])) + error->all(FLERR,"Invalid Coords section in molecule file"); + x[i][0] *= sizescale; x[i][1] *= sizescale; x[i][2] *= sizescale; @@ -669,12 +680,8 @@ void Molecule::types(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Types section in molecule file"); - } - sscanf(line,"%d %d",&tmp,&type[i]); + if (2 != sscanf(line,"%d %d",&tmp,&type[i])) + error->all(FLERR,"Invalid Types section in molecule file"); type[i] += toffset; } @@ -695,12 +702,8 @@ void Molecule::charges(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Charges section in molecule file"); - } - sscanf(line,"%d %lg",&tmp,&q[i]); + if (2 != sscanf(line,"%d %lg",&tmp,&q[i])) + error->all(FLERR,"Invalid Charges section in molecule file"); } } @@ -714,12 +717,8 @@ void Molecule::diameters(char *line) maxradius = 0.0; for (int i = 0; i < natoms; i++) 
{ readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Diameters section in molecule file"); - } - sscanf(line,"%d %lg",&tmp,&radius[i]); + if (2 != sscanf(line,"%d %lg",&tmp,&radius[i])) + error->all(FLERR,"Invalid Diameters section in molecule file"); radius[i] *= sizescale; radius[i] *= 0.5; maxradius = MAX(maxradius,radius[i]); @@ -739,12 +738,8 @@ void Molecule::masses(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 2) - error->all(FLERR,"Invalid Masses section in molecule file"); - } - sscanf(line,"%d %lg",&tmp,&rmass[i]); + if (2 != sscanf(line,"%d %lg",&tmp,&rmass[i])) + error->all(FLERR,"Invalid Masses section in molecule file"); rmass[i] *= sizescale*sizescale*sizescale; } @@ -773,17 +768,13 @@ void Molecule::bonds(int flag, char *line) for (int i = 0; i < nbonds; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 4) - error->all(FLERR,"Invalid Bonds section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&itype,&atom1,&atom2); + if (4 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&itype,&atom1,&atom2)) + error->all(FLERR,"Invalid Bonds section in molecule file"); itype += boffset; if (atom1 <= 0 || atom1 > natoms || - atom2 <= 0 || atom2 > natoms) + atom2 <= 0 || atom2 > natoms) error->one(FLERR,"Invalid atom ID in Bonds section of molecule file"); if (itype <= 0) error->one(FLERR,"Invalid bond type in Bonds section of molecule file"); @@ -795,10 +786,10 @@ void Molecule::bonds(int flag, char *line) bond_atom[m][num_bond[m]] = atom2; num_bond[m]++; if (newton_bond == 0) { - m = atom2-1; - bond_type[m][num_bond[m]] = itype; - bond_atom[m][num_bond[m]] = atom1; - num_bond[m]++; + m = atom2-1; + bond_type[m][num_bond[m]] = itype; + bond_atom[m][num_bond[m]] = atom1; + num_bond[m]++; } } else { count[atom1-1]++; @@ -835,13 +826,9 @@ void Molecule::angles(int flag, char *line) for (int i = 0; i < nangles; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 5) - error->all(FLERR,"Invalid Angles section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&itype,&atom1,&atom2,&atom3); + if (5 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&itype,&atom1,&atom2,&atom3)) + error->all(FLERR,"Invalid Angles section in molecule file"); itype += aoffset; if (atom1 <= 0 || atom1 > natoms || @@ -860,24 +847,24 @@ void Molecule::angles(int flag, char *line) angle_atom3[m][num_angle[m]] = atom3; num_angle[m]++; if (newton_bond == 0) { - m = atom1-1; - angle_type[m][num_angle[m]] = itype; - angle_atom1[m][num_angle[m]] = atom1; - angle_atom2[m][num_angle[m]] = atom2; - angle_atom3[m][num_angle[m]] = atom3; - num_angle[m]++; - m = atom3-1; - angle_type[m][num_angle[m]] = itype; - angle_atom1[m][num_angle[m]] = atom1; - angle_atom2[m][num_angle[m]] = atom2; - angle_atom3[m][num_angle[m]] = atom3; - num_angle[m]++; + m = atom1-1; + angle_type[m][num_angle[m]] = itype; + angle_atom1[m][num_angle[m]] = atom1; + angle_atom2[m][num_angle[m]] = atom2; + angle_atom3[m][num_angle[m]] = atom3; + num_angle[m]++; + m = atom3-1; + angle_type[m][num_angle[m]] = itype; + angle_atom1[m][num_angle[m]] = atom1; + angle_atom2[m][num_angle[m]] = atom2; + angle_atom3[m][num_angle[m]] = atom3; + num_angle[m]++; } } else { 
count[atom2-1]++; if (newton_bond == 0) { - count[atom1-1]++; - count[atom3-1]++; + count[atom1-1]++; + count[atom3-1]++; } } } @@ -911,14 +898,10 @@ void Molecule::dihedrals(int flag, char *line) for (int i = 0; i < ndihedrals; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 6) - error->all(FLERR,"Invalid Dihedrals section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " + if (6 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " ", - &tmp,&itype,&atom1,&atom2,&atom3,&atom4); + &tmp,&itype,&atom1,&atom2,&atom3,&atom4)) + error->all(FLERR,"Invalid Dihedrals section in molecule file"); itype += doffset; if (atom1 <= 0 || atom1 > natoms || @@ -926,10 +909,10 @@ void Molecule::dihedrals(int flag, char *line) atom3 <= 0 || atom3 > natoms || atom4 <= 0 || atom4 > natoms) error->one(FLERR, - "Invalid atom ID in dihedrals section of molecule file"); + "Invalid atom ID in dihedrals section of molecule file"); if (itype <= 0) error->one(FLERR, - "Invalid dihedral type in dihedrals section of molecule file"); + "Invalid dihedral type in dihedrals section of molecule file"); if (flag) { m = atom2-1; @@ -941,34 +924,34 @@ void Molecule::dihedrals(int flag, char *line) dihedral_atom4[m][num_dihedral[m]] = atom4; num_dihedral[m]++; if (newton_bond == 0) { - m = atom1-1; - dihedral_type[m][num_dihedral[m]] = itype; - dihedral_atom1[m][num_dihedral[m]] = atom1; - dihedral_atom2[m][num_dihedral[m]] = atom2; - dihedral_atom3[m][num_dihedral[m]] = atom3; - dihedral_atom4[m][num_dihedral[m]] = atom4; - num_dihedral[m]++; - m = atom3-1; - dihedral_type[m][num_dihedral[m]] = itype; - dihedral_atom1[m][num_dihedral[m]] = atom1; - dihedral_atom2[m][num_dihedral[m]] = atom2; - dihedral_atom3[m][num_dihedral[m]] = atom3; - dihedral_atom4[m][num_dihedral[m]] = atom4; - num_dihedral[m]++; - m = atom4-1; - dihedral_type[m][num_dihedral[m]] = itype; - dihedral_atom1[m][num_dihedral[m]] = atom1; - dihedral_atom2[m][num_dihedral[m]] = atom2; - dihedral_atom3[m][num_dihedral[m]] = atom3; - dihedral_atom4[m][num_dihedral[m]] = atom4; - num_dihedral[m]++; + m = atom1-1; + dihedral_type[m][num_dihedral[m]] = itype; + dihedral_atom1[m][num_dihedral[m]] = atom1; + dihedral_atom2[m][num_dihedral[m]] = atom2; + dihedral_atom3[m][num_dihedral[m]] = atom3; + dihedral_atom4[m][num_dihedral[m]] = atom4; + num_dihedral[m]++; + m = atom3-1; + dihedral_type[m][num_dihedral[m]] = itype; + dihedral_atom1[m][num_dihedral[m]] = atom1; + dihedral_atom2[m][num_dihedral[m]] = atom2; + dihedral_atom3[m][num_dihedral[m]] = atom3; + dihedral_atom4[m][num_dihedral[m]] = atom4; + num_dihedral[m]++; + m = atom4-1; + dihedral_type[m][num_dihedral[m]] = itype; + dihedral_atom1[m][num_dihedral[m]] = atom1; + dihedral_atom2[m][num_dihedral[m]] = atom2; + dihedral_atom3[m][num_dihedral[m]] = atom3; + dihedral_atom4[m][num_dihedral[m]] = atom4; + num_dihedral[m]++; } } else { count[atom2-1]++; if (newton_bond == 0) { - count[atom1-1]++; - count[atom3-1]++; - count[atom4-1]++; + count[atom1-1]++; + count[atom3-1]++; + count[atom4-1]++; } } } @@ -1002,14 +985,10 @@ void Molecule::impropers(int flag, char *line) for (int i = 0; i < nimpropers; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 6) - error->all(FLERR,"Invalid Impropers section in molecule file"); - } - sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " + if (6 != sscanf(line,"%d %d " TAGINT_FORMAT " " TAGINT_FORMAT " " 
TAGINT_FORMAT " " TAGINT_FORMAT " ", - &tmp,&itype,&atom1,&atom2,&atom3,&atom4); + &tmp,&itype,&atom1,&atom2,&atom3,&atom4)) + error->all(FLERR,"Invalid Impropers section in molecule file"); itype += ioffset; if (atom1 <= 0 || atom1 > natoms || @@ -1017,10 +996,10 @@ void Molecule::impropers(int flag, char *line) atom3 <= 0 || atom3 > natoms || atom4 <= 0 || atom4 > natoms) error->one(FLERR, - "Invalid atom ID in impropers section of molecule file"); + "Invalid atom ID in impropers section of molecule file"); if (itype <= 0) error->one(FLERR, - "Invalid improper type in impropers section of molecule file"); + "Invalid improper type in impropers section of molecule file"); if (flag) { m = atom2-1; @@ -1032,34 +1011,34 @@ void Molecule::impropers(int flag, char *line) improper_atom4[m][num_improper[m]] = atom4; num_improper[m]++; if (newton_bond == 0) { - m = atom1-1; - improper_type[m][num_improper[m]] = itype; - improper_atom1[m][num_improper[m]] = atom1; - improper_atom2[m][num_improper[m]] = atom2; - improper_atom3[m][num_improper[m]] = atom3; - improper_atom4[m][num_improper[m]] = atom4; - num_improper[m]++; - m = atom3-1; - improper_type[m][num_improper[m]] = itype; - improper_atom1[m][num_improper[m]] = atom1; - improper_atom2[m][num_improper[m]] = atom2; - improper_atom3[m][num_improper[m]] = atom3; - improper_atom4[m][num_improper[m]] = atom4; - num_improper[m]++; - m = atom4-1; - improper_type[m][num_improper[m]] = itype; - improper_atom1[m][num_improper[m]] = atom1; - improper_atom2[m][num_improper[m]] = atom2; - improper_atom3[m][num_improper[m]] = atom3; - improper_atom4[m][num_improper[m]] = atom4; - num_improper[m]++; + m = atom1-1; + improper_type[m][num_improper[m]] = itype; + improper_atom1[m][num_improper[m]] = atom1; + improper_atom2[m][num_improper[m]] = atom2; + improper_atom3[m][num_improper[m]] = atom3; + improper_atom4[m][num_improper[m]] = atom4; + num_improper[m]++; + m = atom3-1; + improper_type[m][num_improper[m]] = itype; + improper_atom1[m][num_improper[m]] = atom1; + improper_atom2[m][num_improper[m]] = atom2; + improper_atom3[m][num_improper[m]] = atom3; + improper_atom4[m][num_improper[m]] = atom4; + num_improper[m]++; + m = atom4-1; + improper_type[m][num_improper[m]] = itype; + improper_atom1[m][num_improper[m]] = atom1; + improper_atom2[m][num_improper[m]] = atom2; + improper_atom3[m][num_improper[m]] = atom3; + improper_atom4[m][num_improper[m]] = atom4; + num_improper[m]++; } } else { count[atom2-1]++; if (newton_bond == 0) { - count[atom1-1]++; - count[atom3-1]++; - count[atom4-1]++; + count[atom1-1]++; + count[atom3-1]++; + count[atom4-1]++; } } } @@ -1087,13 +1066,9 @@ void Molecule::nspecial_read(int flag, char *line) for (int i = 0; i < natoms; i++) { readline(line); - if (i == 0) { - int nwords = atom->count_words(line); - if (nwords != 4) - error->all(FLERR,"Invalid Special Bond Counts section in " - "molecule file"); - } - sscanf(line,"%d %d %d %d",&tmp,&c1,&c2,&c3); + if (4 != sscanf(line,"%d %d %d %d",&tmp,&c1,&c2,&c3)) + error->all(FLERR,"Invalid Special Bond Counts section in " + "molecule file"); if (flag) { nspecial[i][0] = c1; @@ -1117,13 +1092,13 @@ void Molecule::special_read(char *line) nwords = parse(line,words,maxspecial+1); if (nwords != nspecial[i][2]+1) error->all(FLERR,"Molecule file special list " - "does not match special count"); + "does not match special count"); for (m = 1; m < nwords; m++) { special[i][m-1] = ATOTAGINT(words[m]); if (special[i][m-1] <= 0 || special[i][m-1] > natoms || - special[i][m-1] == i+1) - 
error->all(FLERR,"Invalid special atom index in molecule file"); + special[i][m-1] == i+1) + error->all(FLERR,"Invalid special atom index in molecule file"); } } @@ -1229,7 +1204,8 @@ void Molecule::shakeflag_read(char *line) int tmp; for (int i = 0; i < natoms; i++) { readline(line); - sscanf(line,"%d %d",&tmp,&shake_flag[i]); + if (2 != sscanf(line,"%d %d",&tmp,&shake_flag[i])) + error->all(FLERR,"Invalid Shake Flags section in molecule file"); } for (int i = 0; i < natoms; i++) @@ -1243,23 +1219,32 @@ void Molecule::shakeflag_read(char *line) void Molecule::shakeatom_read(char *line) { - int tmp; + int tmp, nmatch, nwant; for (int i = 0; i < natoms; i++) { readline(line); - if (shake_flag[i] == 1) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1],&shake_atom[i][2]); - else if (shake_flag[i] == 2) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1]); - else if (shake_flag[i] == 3) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1],&shake_atom[i][2]); - else if (shake_flag[i] == 4) - sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - &tmp,&shake_atom[i][0],&shake_atom[i][1], - &shake_atom[i][2],&shake_atom[i][3]); + if (shake_flag[i] == 1) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT + " " TAGINT_FORMAT,&tmp,&shake_atom[i][0], + &shake_atom[i][1],&shake_atom[i][2]); + nwant = 4; + } else if (shake_flag[i] == 2) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&shake_atom[i][0],&shake_atom[i][1]); + nwant = 3; + } else if (shake_flag[i] == 3) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT + " " TAGINT_FORMAT,&tmp,&shake_atom[i][0], + &shake_atom[i][1],&shake_atom[i][2]); + nwant = 4; + } else if (shake_flag[i] == 4) { + nmatch = sscanf(line,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + &tmp,&shake_atom[i][0],&shake_atom[i][1], + &shake_atom[i][2],&shake_atom[i][3]); + nwant = 5; + } + if (nmatch != nwant) + error->all(FLERR,"Invalid shake atom in molecule file"); } for (int i = 0; i < natoms; i++) { @@ -1277,19 +1262,27 @@ void Molecule::shakeatom_read(char *line) void Molecule::shaketype_read(char *line) { - int tmp; + int tmp,nmatch,nwant; for (int i = 0; i < natoms; i++) { readline(line); - if (shake_flag[i] == 1) - sscanf(line,"%d %d %d %d",&tmp, - &shake_type[i][0],&shake_type[i][1],&shake_type[i][2]); - else if (shake_flag[i] == 2) - sscanf(line,"%d %d",&tmp,&shake_type[i][0]); - else if (shake_flag[i] == 3) - sscanf(line,"%d %d %d",&tmp,&shake_type[i][0],&shake_type[i][1]); - else if (shake_flag[i] == 4) - sscanf(line,"%d %d %d %d",&tmp, - &shake_type[i][0],&shake_type[i][1],&shake_type[i][2]); + if (shake_flag[i] == 1) { + nmatch = sscanf(line,"%d %d %d %d",&tmp,&shake_type[i][0], + &shake_type[i][1],&shake_type[i][2]); + nwant = 4; + } else if (shake_flag[i] == 2) { + nmatch = sscanf(line,"%d %d",&tmp,&shake_type[i][0]); + nwant = 2; + } else if (shake_flag[i] == 3) { + nmatch = sscanf(line,"%d %d %d",&tmp,&shake_type[i][0], + &shake_type[i][1]); + nwant = 3; + } else if (shake_flag[i] == 4) { + nmatch = sscanf(line,"%d %d %d %d",&tmp,&shake_type[i][0], + &shake_type[i][1],&shake_type[i][2]); + nwant = 4; + } + if (nmatch != nwant) + error->all(FLERR,"Invalid shake type data in molecule file"); } for (int i = 0; i < natoms; i++) { @@ -1501,46 +1494,46 @@ void Molecule::allocate() if 
(bondflag) { memory->create(bond_type,natoms,bond_per_atom, - "molecule:bond_type"); + "molecule:bond_type"); memory->create(bond_atom,natoms,bond_per_atom, - "molecule:bond_atom"); + "molecule:bond_atom"); } if (angleflag) { memory->create(angle_type,natoms,angle_per_atom, - "molecule:angle_type"); + "molecule:angle_type"); memory->create(angle_atom1,natoms,angle_per_atom, - "molecule:angle_atom1"); + "molecule:angle_atom1"); memory->create(angle_atom2,natoms,angle_per_atom, - "molecule:angle_atom2"); + "molecule:angle_atom2"); memory->create(angle_atom3,natoms,angle_per_atom, - "molecule:angle_atom3"); + "molecule:angle_atom3"); } if (dihedralflag) { memory->create(dihedral_type,natoms,dihedral_per_atom, - "molecule:dihedral_type"); + "molecule:dihedral_type"); memory->create(dihedral_atom1,natoms,dihedral_per_atom, - "molecule:dihedral_atom1"); + "molecule:dihedral_atom1"); memory->create(dihedral_atom2,natoms,dihedral_per_atom, - "molecule:dihedral_atom2"); + "molecule:dihedral_atom2"); memory->create(dihedral_atom3,natoms,dihedral_per_atom, - "molecule:dihedral_atom3"); + "molecule:dihedral_atom3"); memory->create(dihedral_atom4,natoms,dihedral_per_atom, - "molecule:dihedral_atom4"); + "molecule:dihedral_atom4"); } if (improperflag) { memory->create(improper_type,natoms,improper_per_atom, - "molecule:improper_type"); + "molecule:improper_type"); memory->create(improper_atom1,natoms,improper_per_atom, - "molecule:improper_atom1"); + "molecule:improper_atom1"); memory->create(improper_atom2,natoms,improper_per_atom, - "molecule:improper_atom2"); + "molecule:improper_atom2"); memory->create(improper_atom3,natoms,improper_per_atom, - "molecule:improper_atom3"); + "molecule:improper_atom3"); memory->create(improper_atom4,natoms,improper_per_atom, - "molecule:improper_atom4"); + "molecule:improper_atom4"); } if (shakeflag) { @@ -1653,7 +1646,7 @@ void Molecule::parse_keyword(int flag, char *line, char *keyword) if (me == 0) { if (fgets(line,MAXLINE,fp) == NULL) eof = 1; while (eof == 0 && strspn(line," \t\n\r") == strlen(line)) { - if (fgets(line,MAXLINE,fp) == NULL) eof = 1; + if (fgets(line,MAXLINE,fp) == NULL) eof = 1; } if (fgets(keyword,MAXLINE,fp) == NULL) eof = 1; } diff --git a/src/pair.cpp b/src/pair.cpp index ce711c4f5d..05319e33f2 100644 --- a/src/pair.cpp +++ b/src/pair.cpp @@ -75,7 +75,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp) ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = 0; reinitflag = 1; - // pair_modify settingsx + // pair_modify settings compute_flag = 1; manybody_flag = 0; diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp index 48364a86c4..4a98cca614 100644 --- a/src/pair_hybrid.cpp +++ b/src/pair_hybrid.cpp @@ -36,7 +36,7 @@ PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp), map(NULL), special_lj(NULL), special_coul(NULL), compute_tally(NULL) { nstyles = 0; - + outerflag = 0; respaflag = 0; } @@ -487,7 +487,7 @@ void PairHybrid::init_style() if (((force->special_lj[i] == 0.0) || (force->special_lj[i] == 1.0)) && (force->special_lj[i] != special_lj[istyle][i])) error->all(FLERR,"Pair_modify special setting for pair hybrid " - "incompatible with global special_bonds setting"); + "incompatible with global special_bonds setting"); } } @@ -497,7 +497,7 @@ void PairHybrid::init_style() || (force->special_coul[i] == 1.0)) && (force->special_coul[i] != special_coul[istyle][i])) error->all(FLERR,"Pair_modify special setting for pair hybrid " - "incompatible with global special_bonds setting"); + "incompatible with global special_bonds setting"); 
} } } @@ -829,6 +829,12 @@ void PairHybrid::modify_params(int narg, char **arg) Pair::modify_params(narg,arg); for (int m = 0; m < nstyles; m++) styles[m]->modify_params(narg,arg); } + + // reset global compute_flag since there may have been changes + // to any of the substyles + compute_flag = 0; + for (int m = 0; m < nstyles; m++) + if (styles[m]->compute_flag) compute_flag = 1; } /* ---------------------------------------------------------------------- diff --git a/src/version.h b/src/version.h index 0a22a92328..ff33fa3b06 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "10 Aug 2017" +#define LAMMPS_VERSION "17 Aug 2017" diff --git a/tools/msi2lmp/README b/tools/msi2lmp/README index db9b1aca5e..9ac7af5607 100644 --- a/tools/msi2lmp/README +++ b/tools/msi2lmp/README @@ -140,6 +140,8 @@ msi2lmp has the following known limitations: - there is no support for auto-equivalences to supplement fully parameterized interactions with heuristic ones - there is no support for bond increments +- there is no support for coordinates defined by symmetry operations, + i.e. the .mdf file has to be set up for space group P1 ------------------------------------------------------------------------
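For reference, the type2z2r setup in the eam/fs file2array() code at the top of this section flattens the lower triangle of the element matrix: the loop "for (m = 0; m < irow; m++) n += m + 1" counts the entries of all rows before irow, so the flat index is irow*(irow+1)/2 + icol. A minimal standalone sketch of that identity (illustrative only, not part of the patch):

#include <cassert>

// Flat lower-triangular index for an element pair with irow >= icol >= 0;
// matches the summation loop used to fill type2z2r.
int z2r_index(int irow, int icol) {
  return irow * (irow + 1) / 2 + icol;
}

int main() {
  // For 3 elements the z2r arrays are ordered
  // (0,0)=0, (1,0)=1, (1,1)=2, (2,0)=3, (2,1)=4, (2,2)=5.
  assert(z2r_index(2, 1) == 4);
  return 0;
}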
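The pair_gayberne_intel.cpp and pair_sw_intel.cpp hunks above compute the padding remainder with a bitwise AND instead of a modulo. The two are interchangeable only when pad_width is a power of two, which the padded widths used by these styles are assumed to be. A standalone sketch of the identity (illustrative only):

#include <cassert>

int main() {
  const int pad_width = 8;   // must be a power of two for the AND trick
  // For non-negative x, x % p equals x & (p - 1) exactly when p = 2^k.
  for (int x = 0; x < 1000; x++)
    assert(x % pad_width == (x & (pad_width - 1)));
  return 0;
}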
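The molecule.cpp changes above replace the per-section count_words() pre-checks with direct validation of the sscanf() return value, which reports the number of fields successfully converted; malformed or short lines are now caught on every line of a section rather than only the first. A minimal sketch of the pattern, using a hypothetical input line (not LAMMPS code):

#include <cstdio>

int main() {
  const char *line = "7 1.0 2.0";   // one field short of id/x/y/z
  int id;
  double x, y, z;
  // sscanf returns the number of successful conversions (3 here), so
  // comparing it against the expected count of 4 flags the bad line.
  if (4 != sscanf(line, "%d %lg %lg %lg", &id, &x, &y, &z))
    fprintf(stderr, "invalid Coords line: %s\n", line);
  return 0;
}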