Merge branch 'master' into USER-DPD_kokkos

Stan Moore 2017-08-22 13:50:19 -06:00
commit 5c985946d5
276 changed files with 26793 additions and 4212 deletions

.github/CODEOWNERS (new file, +21 lines)

@ -0,0 +1,21 @@
# This file contains file patterns that trigger automatic
# code review requests from the users that own these files
# Order matters, the last match has the highest precedence
# library folders
lib/colvars/* @giacomofiorin
lib/compress/* @akohlmey
lib/kokkos/* @stanmoore1
lib/molfile/* @akohlmey
lib/qmmm/* @akohlmey
lib/vtk/* @rbberger
# packages
src/KOKKOS @stanmoore1
src/USER-CGSDK @akohlmey
src/USER-COLVARS @giacomofiorin
src/USER-OMP @akohlmey
src/USER-QMMM @akohlmey
# tools
tools/msi2lmp/* @akohlmey

.gitignore (+8 lines)

@ -32,3 +32,11 @@ log.cite
.Trashes
ehthumbs.db
Thumbs.db
#cmake
/build*
/CMakeCache.txt
/CMakeFiles/
/Makefile
/cmake_install.cmake
/lmp

cmake/CMakeLists.txt (new file, +547 lines)

@ -0,0 +1,547 @@
########################################
# CMake build system
# This file is part of LAMMPS
# Created by Christoph Junghans and Richard Berger
cmake_minimum_required(VERSION 3.1)
project(lammps)
set(SOVERSION 0)
set(LAMMPS_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../src)
set(LAMMPS_LIB_SOURCE_DIR ${CMAKE_SOURCE_DIR}/../lib)
set(LAMMPS_LIB_BINARY_DIR ${CMAKE_BINARY_DIR}/lib)
#To avoid conflicts with the old Makefile build system, we build everything here
file(GLOB LIB_SOURCES ${LAMMPS_SOURCE_DIR}/*.cpp)
file(GLOB LMP_SOURCES ${LAMMPS_SOURCE_DIR}/main.cpp)
list(REMOVE_ITEM LIB_SOURCES ${LMP_SOURCES})
# Cmake modules/macros are in a subdirectory to keep this file cleaner
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/Modules)
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
#release comes with -O3 by default
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
foreach(STYLE_FILE style_angle.h style_atom.h style_body.h style_bond.h style_command.h style_compute.h style_dihedral.h style_dump.h
style_fix.h style_improper.h style_integrate.h style_kspace.h style_minimize.h style_nbin.h style_npair.h style_nstencil.h
style_ntopo.h style_pair.h style_reader.h style_region.h)
if(EXISTS ${LAMMPS_SOURCE_DIR}/${STYLE_FILE})
message(FATAL_ERROR "There is a ${STYLE_FILE} in ${LAMMPS_SOURCE_DIR}, please clean up the source directory first")
endif()
endforeach()
enable_language(CXX)
######################################################################
# compiler tests
# these need to be done early (before further tests).
#####################################################################
include(CheckCCompilerFlag)
########################################################################
# User input options #
########################################################################
option(BUILD_SHARED_LIBS "Build shared libs" OFF)
option(INSTALL_LIB "Install lammps library and header" ON)
include(GNUInstallDirs)
set(LAMMPS_LINK_LIBS)
option(ENABLE_MPI "Build MPI version" OFF)
if(ENABLE_MPI)
find_package(MPI REQUIRED)
include_directories(${MPI_C_INCLUDE_PATH})
list(APPEND LAMMPS_LINK_LIBS ${MPI_CXX_LIBRARIES})
option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
if(LAMMPS_LONGLONG_TO_LONG)
add_definitions(-DLAMMPS_LONGLONG_TO_LONG)
endif()
else()
file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c)
list(APPEND LIB_SOURCES ${MPI_SOURCES})
include_directories(${LAMMPS_SOURCE_DIR}/STUBS)
endif()
set(LAMMPS_SIZE_LIMIT "LAMMPS_SMALLBIG" CACHE STRING "Lammps size limit")
set_property(CACHE LAMMPS_SIZE_LIMIT PROPERTY STRINGS LAMMPS_SMALLBIG LAMMPS_BIGBIG LAMMPS_SMALLSMALL)
add_definitions(-D${LAMMPS_SIZE_LIMIT})
set(LAMMPS_MEMALIGN "64" CACHE STRING "enables the use of the posix_memalign() call instead of malloc() when large chunks of memory are allocated by LAMMPS")
add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
if(LAMMPS_EXCEPTIONS)
add_definitions(-DLAMMPS_EXCEPTIONS)
endif()
option(CMAKE_VERBOSE_MAKEFILE "Verbose makefile" OFF)
option(ENABLE_TESTING "Enable testing" OFF)
if(ENABLE_TESTING)
enable_testing()
endif(ENABLE_TESTING)
option(ENABLE_ALL "Build all default packages" OFF)
set(DEFAULT_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS CORESHELL DIPOLE GRANULAR
KSPACE MANYBODY MC MEAM MISC MOLECULE PERI QEQ
REAX REPLICA RIGID SHOCK SNAP SRD)
set(OTHER_PACKAGES KIM PYTHON MSCG MPIIO VORONOI POEMS
USER-ATC USER-AWPMD USER-CGDNA
USER-CGSDK USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF
USER-FEP USER-H5MD USER-LB USER-MANIFOLD USER-MEAMC USER-MGPT USER-MISC
USER-MOLFILE USER-NETCDF USER-PHONON USER-QTB USER-REAXC USER-SMD
USER-SMTBQ USER-SPH USER-TALLY USER-VTK USER-QUIP USER-QMMM)
set(ACCEL_PACKAGES USER-OMP KOKKOS OPT USER-INTEL GPU)
foreach(PKG ${DEFAULT_PACKAGES})
option(ENABLE_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
endforeach()
foreach(PKG ${ACCEL_PACKAGES} ${OTHER_PACKAGES})
option(ENABLE_${PKG} "Build ${PKG} Package" OFF)
endforeach()
macro(pkg_depends PKG1 PKG2)
if(ENABLE_${PKG1} AND NOT ENABLE_${PKG2})
message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
endif()
endmacro()
pkg_depends(MPIIO MPI)
pkg_depends(QEQ MANYBODY)
pkg_depends(USER-ATC MANYBODY)
pkg_depends(USER-H5MD MPI)
pkg_depends(USER-LB MPI)
pkg_depends(USER-MISC MANYBODY)
pkg_depends(USER-PHONON KSPACE)
if(ENABLE_BODY AND ENABLE_POEMS)
message(FATAL_ERROR "BODY and POEMS cannot be enabled at the same time")
endif()
######################################################
# packages with special compiler needs or external libs
######################################################
if(ENABLE_REAX OR ENABLE_MEAM OR ENABLE_USER-QUIP OR ENABLE_USER-QMMM)
enable_language(Fortran)
endif()
if(ENABLE_KOKKOS OR ENABLE_MSCG)
# starting with CMake 3.1 this is all you have to do to enforce C++11
set(CMAKE_CXX_STANDARD 11) # C++11...
set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required...
set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11
endif()
if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
find_package(OpenMP REQUIRED)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
endif()
if(ENABLE_KSPACE)
set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)
if(NOT FFT STREQUAL "KISSFFT")
find_package(${FFT} REQUIRED)
add_definitions(-DFFT_${FFT})
include_directories(${${FFT}_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
endif()
set(PACK_OPTIMIZATION "PACK_ARRAY" CACHE STRING "Optimization for FFT")
set_property(CACHE PACK_OPTIMIZATION PROPERTY STRINGS PACK_ARRAY PACK_POINTER PACK_MEMCPY)
if(NOT PACK_OPTIMIZATION STREQUAL "PACK_ARRAY")
add_definitions(-D${PACK_OPTIMIZATION})
endif()
endif()
if(ENABLE_MISC)
option(LAMMPS_XDR "include XDR compatibility files for doing particle dumps in XTC format" OFF)
if(LAMMPS_XDR)
add_definitions(-DLAMMPS_XDR)
endif()
endif()
if(ENABLE_MSCG OR ENABLE_USER-ATC OR ENABLE_USER-AWPMD OR ENABLE_USER-QUIP)
find_package(LAPACK)
if(LAPACK_FOUND)
list(APPEND LAMMPS_LINK_LIBS ${LAPACK_LIBRARIES})
else()
enable_language(Fortran)
file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.f)
list(APPEND LIB_SOURCES ${LAPACK_SOURCES})
endif()
endif()
if(ENABLE_PYTHON)
find_package(PythonInterp REQUIRED)
find_package(PythonLibs REQUIRED)
add_definitions(-DLMP_PYTHON)
include_directories(${PYTHON_INCLUDE_DIR})
list(APPEND LAMMPS_LINK_LIBS ${PYTHON_LIBRARY})
if(NOT PYTHON_INSTDIR)
execute_process(COMMAND ${PYTHON_EXECUTABLE}
-c "import distutils.sysconfig as cg; print(cg.get_python_lib(1,0,prefix='${CMAKE_INSTALL_PREFIX}'))"
OUTPUT_VARIABLE PYTHON_INSTDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
install(FILES ${CMAKE_SOURCE_DIR}/../python/lammps.py DESTINATION ${PYTHON_INSTDIR})
if(NOT BUILD_SHARED_LIBS)
message(FATAL_ERROR "Python package need lammps to be build shared, use -DBUILD_SHARED_LIBS=ON")
endif()
endif()
find_package(JPEG)
if(JPEG_FOUND)
add_definitions(-DLAMMPS_JPEG)
include_directories(${JPEG_INCLUDE_DIR})
list(APPEND LAMMPS_LINK_LIBS ${JPEG_LIBRARIES})
endif()
find_package(PNG)
find_package(ZLIB)
if(PNG_FOUND AND ZLIB_FOUND)
include_directories(${PNG_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${PNG_LIBRARIES} ${ZLIB_LIBRARIES})
add_definitions(-DLAMMPS_PNG)
endif()
find_program(GZIP_EXECUTABLE gzip)
find_package_handle_standard_args(GZIP REQUIRED_VARS GZIP_EXECUTABLE)
if(GZIP_FOUND)
add_definitions(-DLAMMPS_GZIP)
endif()
find_program(FFMPEG_EXECUTABLE ffmpeg)
find_package_handle_standard_args(FFMPEG REQUIRED_VARS FFMPEG_EXECUTABLE)
if(FFMPEG_FOUND)
add_definitions(-DLAMMPS_FFMPEG)
endif()
if(ENABLE_VORONOI)
find_package(VORO REQUIRED) #some distros
include_directories(${VORO_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
endif()
if(ENABLE_USER-MOLFILE)
list(APPEND LAMMPS_LINK_LIBS ${CMAKE_DL_LIBS})
endif()
if(ENABLE_USER-NETCDF)
find_package(NetCDF REQUIRED)
include_directories(${NETCDF_INCLUDE_DIR})
list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARY})
add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
endif()
if(ENABLE_USER-SMD)
find_package(Eigen3 REQUIRED)
include_directories(${EIGEN3_INCLUDE_DIR})
endif()
if(ENABLE_USER-QUIP)
find_package(QUIP REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
endif()
if(ENABLE_USER-QMMM)
find_package(QE REQUIRED)
include_directories(${QE_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
endif()
if(ENABLE_USER-AWPMD)
include_directories(${LAMMPS_LIB_SOURCE_DIR}/awpmd/systems/interact
${LAMMPS_LIB_SOURCE_DIR}/awpmd/ivutils/include)
endif()
if(ENABLE_USER-H5MD)
find_package(HDF5 REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${HDF5_LIBRARIES})
include_directories(${HDF5_INCLUDE_DIRS} ${LAMMPS_LIB_SOURCE_DIR}/h5md/include)
endif()
if(ENABLE_USER-VTK)
find_package(VTK REQUIRED NO_MODULE)
include(${VTK_USE_FILE})
add_definitions(-DLAMMPS_VTK)
list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
endif()
if(ENABLE_KIM)
find_package(KIM REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${KIM_LIBRARIES})
include_directories(${KIM_INCLUDE_DIRS})
endif()
if(ENABLE_MSCG)
find_package(GSL REQUIRED)
set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mscg)
set(MSCG_TARBALL ${LAMMPS_LIB_MSCG_BIN_DIR}/MS-CG-master.zip)
set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_MSCG_BIN_DIR}/MSCG-release-master/src)
if(NOT EXISTS ${LAMMPS_LIB_MSCG_BIN_DIR})
if(NOT EXISTS ${MSCG_TARBALL})
message(STATUS "Downloading ${MSCG_TARBALL}")
file(DOWNLOAD
https://github.com/uchicago-voth/MSCG-release/archive/master.zip
${MSCG_TARBALL} SHOW_PROGRESS) # EXPECTED_MD5 cannot be used since the download tracks the master branch
endif()
message(STATUS "Unpacking ${MSCG_TARBALL}")
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xvf ${MSCG_TARBALL}
WORKING_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/mscg)
endif()
file(GLOB MSCG_SOURCES ${LAMMPS_LIB_MSCG_BIN_DIR}/*.cpp)
list(APPEND LIB_SOURCES ${MSCG_SOURCES})
foreach(MSCG_SOURCE ${MSCG_SOURCES})
set_property(SOURCE ${MSCG_SOURCE} APPEND PROPERTY COMPILE_DEFINITIONS
DIMENSION=3 _exclude_gromacs=1)
endforeach()
include_directories(${LAMMPS_LIB_MSCG_BIN_DIR} ${GSL_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${GSL_LIBRARIES})
endif()
########################################################################
# Basic system tests (standard libraries, headers, functions, types) #
########################################################################
include(CheckIncludeFile)
foreach(HEADER math.h)
check_include_file(${HEADER} FOUND_${HEADER})
if(NOT FOUND_${HEADER})
message(FATAL_ERROR "Could not find needed header - ${HEADER}")
endif(NOT FOUND_${HEADER})
endforeach(HEADER)
set(MATH_LIBRARIES "m" CACHE STRING "math library")
mark_as_advanced( MATH_LIBRARIES )
include(CheckLibraryExists)
foreach(FUNC sin cos)
check_library_exists(${MATH_LIBRARIES} ${FUNC} "" FOUND_${FUNC}_${MATH_LIBRARIES})
if(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
message(FATAL_ERROR "Could not find needed math function - ${FUNC}")
endif(NOT FOUND_${FUNC}_${MATH_LIBRARIES})
endforeach(FUNC)
list(APPEND LAMMPS_LINK_LIBS ${MATH_LIBRARIES})
######################################
# Generate Basic Style files
######################################
include(StyleHeaderUtils)
RegisterStyles(${LAMMPS_SOURCE_DIR})
##############################################
# add sources of enabled packages
############################################
foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES})
if(ENABLE_${PKG})
set(${PKG}_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/${PKG})
# detects styles in package and adds them to global list
RegisterStyles(${${PKG}_SOURCES_DIR})
file(GLOB ${PKG}_SOURCES ${${PKG}_SOURCES_DIR}/*.cpp)
list(APPEND LIB_SOURCES ${${PKG}_SOURCES})
include_directories(${${PKG}_SOURCES_DIR})
endif()
endforeach()
##############################################
# add lib sources of (simple) enabled packages
############################################
foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD
USER-MOLFILE USER-QMMM)
if(ENABLE_${SIMPLE_LIB})
string(REGEX REPLACE "^USER-" "" SIMPLE_LIB "${SIMPLE_LIB}")
string(TOLOWER "${SIMPLE_LIB}" INC_DIR)
file(GLOB_RECURSE ${SIMPLE_LIB}_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.F
${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.c ${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR}/*.cpp)
list(APPEND LIB_SOURCES ${${SIMPLE_LIB}_SOURCES})
include_directories(${LAMMPS_LIB_SOURCE_DIR}/${INC_DIR})
endif()
endforeach()
######################################################################
# packages which selectively include variants based on enabled styles
# e.g. accelerator packages
######################################################################
if(ENABLE_USER-OMP)
set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP)
set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp
${USER-OMP_SOURCES_DIR}/thr_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_nh_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_nh_sphere_omp.cpp)
set_property(GLOBAL PROPERTY "OMP_SOURCES" "${USER-OMP_SOURCES}")
# detects styles which have USER-OMP version
RegisterStylesExt(${USER-OMP_SOURCES_DIR} omp OMP_SOURCES)
get_property(USER-OMP_SOURCES GLOBAL PROPERTY OMP_SOURCES)
list(APPEND LIB_SOURCES ${USER-OMP_SOURCES})
include_directories(${USER-OMP_SOURCES_DIR})
endif()
if(ENABLE_KOKKOS)
set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
add_definitions(-DLMP_KOKKOS)
add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
${LAMMPS_LIB_KOKKOS_BIN_DIR})
include_directories(${Kokkos_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS kokkos)
set(KOKKOS_PKG_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/KOKKOS)
set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/atom_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/fix_nh_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/domain_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
# detects styles which have KOKKOS version
RegisterStylesExt(${KOKKOS_PKG_SOURCES_DIR} kokkos KOKKOS_PKG_SOURCES)
get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES)
list(APPEND LIB_SOURCES ${KOKKOS_PKG_SOURCES})
include_directories(${KOKKOS_PKG_SOURCES_DIR})
endif()
if(ENABLE_OPT)
set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT)
set(OPT_SOURCES)
set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}")
# detects styles which have OPT version
RegisterStylesExt(${OPT_SOURCES_DIR} opt OPT_SOURCES)
get_property(OPT_SOURCES GLOBAL PROPERTY OPT_SOURCES)
list(APPEND LIB_SOURCES ${OPT_SOURCES})
include_directories(${OPT_SOURCES_DIR})
endif()
if(ENABLE_USER-INTEL)
set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/intel_preprocess.h
${USER-INTEL_SOURCES_DIR}/intel_buffers.h
${USER-INTEL_SOURCES_DIR}/intel_buffers.cpp
${USER-INTEL_SOURCES_DIR}/math_extra_intel.h
${USER-INTEL_SOURCES_DIR}/nbin_intel.h
${USER-INTEL_SOURCES_DIR}/nbin_intel.cpp
${USER-INTEL_SOURCES_DIR}/npair_intel.h
${USER-INTEL_SOURCES_DIR}/npair_intel.cpp
${USER-INTEL_SOURCES_DIR}/intel_simd.h
${USER-INTEL_SOURCES_DIR}/intel_intrinsics.h)
set_property(GLOBAL PROPERTY "USER-INTEL_SOURCES" "${USER-INTEL_SOURCES}")
# detects styles which have USER-INTEL version
RegisterStylesExt(${USER-INTEL_SOURCES_DIR} opt USER-INTEL_SOURCES)
get_property(USER-INTEL_SOURCES GLOBAL PROPERTY USER-INTEL_SOURCES)
list(APPEND LIB_SOURCES ${USER-INTEL_SOURCES})
include_directories(${USER-INTEL_SOURCES_DIR})
endif()
if(ENABLE_GPU)
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Couldn't find bin2c, use -DBIN2C helping cmake to find it.")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "Lammps gpu precision size")
set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE)
add_definitions(-D_${GPU_PREC})
add_definitions(-DNV_KERNEL -DUCL_CUDADR)
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} opt GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_SOURCE_DIR}/gpu/*.cu)
file(GLOB_RECURSE GPU_NOT_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
list(REMOVE_ITEM GPU_LIB_CU ${GPU_NOT_LIB_CU})
include_directories(${GPU_SOURCES_DIR} ${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
add_definitions(-DCUDPP_OPT)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/*.cu)
endif()
cuda_compile(GPU_OBJS ${GPU_LIB_CU} ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC>)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
foreach(CU_OBJ ${GPU_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
if(${CU_NAME} STREQUAL "pppm_d") #pppm_d doesn't get linked into the lib
set(CU_FORBIDDEN_OBJ "${CU_OBJ}")
endif()
endforeach()
list(REMOVE_ITEM GPU_OBJS "${CU_FORBIDDEN_OBJ}")
list(APPEND LIB_SOURCES ${GPU_SOURCES} ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
endif()
######################################################
# Generate style headers based on global list of
# styles registered during package selection
######################################################
set(LAMMPS_STYLE_HEADERS_DIR ${CMAKE_CURRENT_BINARY_DIR}/styles)
GenerateStyleHeaders(${LAMMPS_STYLE_HEADERS_DIR})
include_directories(${LAMMPS_SOURCE_DIR})
include_directories(${LAMMPS_STYLE_HEADERS_DIR})
###########################################
# Actually add executable and lib to build
############################################
add_library(lammps ${LIB_SOURCES})
target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
if(INSTALL_LIB)
install(TARGETS lammps LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
install(FILES ${LAMMPS_SOURCE_DIR}/lammps.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
elseif(BUILD_SHARED_LIBS)
message(FATAL_ERROR "Shared library has to be installed, use -DINSTALL_LIB=ON to install lammps with a library")
endif()
add_executable(lmp ${LMP_SOURCES})
target_link_libraries(lmp lammps)
install(TARGETS lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
if(ENABLE_TESTING)
add_test(ShowHelp ${CMAKE_CURRENT_BINARY_DIR}/lmp -help)
endif()
##################################
# Print package summary
##################################
foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES} ${ACCEL_PACKAGES})
if(ENABLE_${PKG})
message(STATUS "Building package: ${PKG}")
endif()
endforeach()

cmake/Modules/FindFFTW2.cmake (new file)

@ -0,0 +1,22 @@
# - Find fftw2
# Find the native FFTW2 headers and libraries.
#
# FFTW2_INCLUDE_DIRS - where to find fftw2.h, etc.
# FFTW2_LIBRARIES - List of libraries when using fftw2.
# FFTW2_FOUND - True if fftw2 found.
#
find_path(FFTW2_INCLUDE_DIR fftw.h)
find_library(FFTW2_LIBRARY NAMES fftw)
set(FFTW2_LIBRARIES ${FFTW2_LIBRARY})
set(FFTW2_INCLUDE_DIRS ${FFTW2_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set FFTW2_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(FFTW2 DEFAULT_MSG FFTW2_LIBRARY FFTW2_INCLUDE_DIR)
mark_as_advanced(FFTW2_INCLUDE_DIR FFTW2_LIBRARY )
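For orientation only: a minimal sketch of how the variables set by this module (and, analogously, by the other Find modules below) would be consumed from a CMakeLists.txt. The target name `myexe` is purely illustrative; the main cmake/CMakeLists.txt above uses the same pattern generically via ${FFT}.
```
# Illustrative consumer of FindFFTW2.cmake; assumes cmake/Modules is on CMAKE_MODULE_PATH.
find_package(FFTW2 REQUIRED)                # defines FFTW2_FOUND, FFTW2_INCLUDE_DIRS, FFTW2_LIBRARIES
include_directories(${FFTW2_INCLUDE_DIRS})  # make fftw.h visible to the compiler
add_executable(myexe main.cpp)              # hypothetical target
target_link_libraries(myexe ${FFTW2_LIBRARIES})
```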

cmake/Modules/FindFFTW3.cmake (new file)

@ -0,0 +1,25 @@
# - Find fftw3
# Find the native FFTW3 headers and libraries.
#
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
# FFTW3_LIBRARIES - List of libraries when using fftw3.
# FFTW3_FOUND - True if fftw3 found.
#
find_package(PkgConfig)
pkg_check_modules(PC_FFTW3 fftw3)
find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS})
find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS})
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR)
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY )

cmake/Modules/FindKIM.cmake (new file)

@ -0,0 +1,22 @@
# - Find kim
# Find the native KIM headers and libraries.
#
# KIM_INCLUDE_DIRS - where to find kim.h, etc.
# KIM_LIBRARIES - List of libraries when using kim.
# KIM_FOUND - True if kim found.
#
find_path(KIM_INCLUDE_DIR KIM_API.h PATH_SUFFIXES kim-api-v1)
find_library(KIM_LIBRARY NAMES kim-api-v1)
set(KIM_LIBRARIES ${KIM_LIBRARY})
set(KIM_INCLUDE_DIRS ${KIM_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set KIM_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(KIM DEFAULT_MSG KIM_LIBRARY KIM_INCLUDE_DIR)
mark_as_advanced(KIM_INCLUDE_DIR KIM_LIBRARY )

cmake/Modules/FindMKL.cmake (new file)

@ -0,0 +1,22 @@
# - Find mkl
# Find the native MKL headers and libraries.
#
# MKL_INCLUDE_DIRS - where to find mkl.h, etc.
# MKL_LIBRARIES - List of libraries when using mkl.
# MKL_FOUND - True if mkl found.
#
find_path(MKL_INCLUDE_DIR mkl_dfti.h HINTS $ENV{MKLROOT}/include)
find_library(MKL_LIBRARY NAMES mkl_rt HINTS $ENV{MKLROOT}/lib $ENV{MKLROOT}/lib/intel64)
set(MKL_LIBRARIES ${MKL_LIBRARY})
set(MKL_INCLUDE_DIRS ${MKL_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set MKL_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIR)
mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARY )

cmake/Modules/FindNetCDF.cmake (new file)

@ -0,0 +1,118 @@
# - Find NetCDF
# Find the native NetCDF includes and library
#
# NETCDF_INCLUDE_DIR - user modifiable choice of where netcdf headers are
# NETCDF_LIBRARY - user modifiable choice of where netcdf libraries are
#
# Your package can require certain interfaces to be FOUND by setting these
#
# NETCDF_CXX - require the C++ interface and link the C++ library
# NETCDF_F77 - require the F77 interface and link the fortran library
# NETCDF_F90 - require the F90 interface and link the fortran library
#
# Or equivalently by calling FindNetCDF with a COMPONENTS argument containing one or
# more of "CXX;F77;F90".
#
# When interfaces are requested the user has access to interface specific hints:
#
# NETCDF_${LANG}_INCLUDE_DIR - where to search for interface header files
# NETCDF_${LANG}_LIBRARY - where to search for interface libraries
#
# This module returns these variables for the rest of the project to use.
#
# NETCDF_FOUND - True if NetCDF found including required interfaces (see below)
# NETCDF_LIBRARIES - All netcdf related libraries.
# NETCDF_INCLUDE_DIRS - All directories to include.
# NETCDF_HAS_INTERFACES - Whether requested interfaces were found or not.
# NETCDF_${LANG}_INCLUDE_DIRS/NETCDF_${LANG}_LIBRARIES - C/C++/F77/F90 only interface
#
# Normal usage would be:
# set (NETCDF_F90 "YES")
# find_package (NetCDF REQUIRED)
# target_link_libraries (uses_everything ${NETCDF_LIBRARIES})
# target_link_libraries (only_uses_f90 ${NETCDF_F90_LIBRARIES})
#search starting from user editable cache var
if (NETCDF_INCLUDE_DIR AND NETCDF_LIBRARY)
# Already in cache, be silent
set (NETCDF_FIND_QUIETLY TRUE)
endif ()
set(USE_DEFAULT_PATHS "NO_DEFAULT_PATH")
if(NETCDF_USE_DEFAULT_PATHS)
set(USE_DEFAULT_PATHS "")
endif()
find_path (NETCDF_INCLUDE_DIR netcdf.h
HINTS "${NETCDF_DIR}/include")
mark_as_advanced (NETCDF_INCLUDE_DIR)
set (NETCDF_C_INCLUDE_DIRS ${NETCDF_INCLUDE_DIR})
find_library (NETCDF_LIBRARY NAMES netcdf
HINTS "${NETCDF_DIR}/lib")
mark_as_advanced (NETCDF_LIBRARY)
set (NETCDF_C_LIBRARIES ${NETCDF_LIBRARY})
#start finding requested language components
set (NetCDF_libs "")
set (NetCDF_includes "${NETCDF_INCLUDE_DIR}")
get_filename_component (NetCDF_lib_dirs "${NETCDF_LIBRARY}" PATH)
set (NETCDF_HAS_INTERFACES "YES") # will be set to NO if we're missing any interfaces
macro (NetCDF_check_interface lang header libs)
if (NETCDF_${lang})
#search starting from user modifiable cache var
find_path (NETCDF_${lang}_INCLUDE_DIR NAMES ${header}
HINTS "${NETCDF_INCLUDE_DIR}"
HINTS "${NETCDF_${lang}_ROOT}/include"
${USE_DEFAULT_PATHS})
find_library (NETCDF_${lang}_LIBRARY NAMES ${libs}
HINTS "${NetCDF_lib_dirs}"
HINTS "${NETCDF_${lang}_ROOT}/lib"
${USE_DEFAULT_PATHS})
mark_as_advanced (NETCDF_${lang}_INCLUDE_DIR NETCDF_${lang}_LIBRARY)
#export to internal varS that rest of project can use directly
set (NETCDF_${lang}_LIBRARIES ${NETCDF_${lang}_LIBRARY})
set (NETCDF_${lang}_INCLUDE_DIRS ${NETCDF_${lang}_INCLUDE_DIR})
if (NETCDF_${lang}_INCLUDE_DIR AND NETCDF_${lang}_LIBRARY)
list (APPEND NetCDF_libs ${NETCDF_${lang}_LIBRARY})
list (APPEND NetCDF_includes ${NETCDF_${lang}_INCLUDE_DIR})
else ()
set (NETCDF_HAS_INTERFACES "NO")
message (STATUS "Failed to find NetCDF interface for ${lang}")
endif ()
endif ()
endmacro ()
list (FIND NetCDF_FIND_COMPONENTS "CXX" _nextcomp)
if (_nextcomp GREATER -1)
set (NETCDF_CXX 1)
endif ()
list (FIND NetCDF_FIND_COMPONENTS "F77" _nextcomp)
if (_nextcomp GREATER -1)
set (NETCDF_F77 1)
endif ()
list (FIND NetCDF_FIND_COMPONENTS "F90" _nextcomp)
if (_nextcomp GREATER -1)
set (NETCDF_F90 1)
endif ()
NetCDF_check_interface (CXX netcdfcpp.h netcdf_c++)
NetCDF_check_interface (F77 netcdf.inc netcdff)
NetCDF_check_interface (F90 netcdf.mod netcdff)
#export accumulated results to internal varS that rest of project can depend on
list (APPEND NetCDF_libs "${NETCDF_C_LIBRARIES}")
set (NETCDF_LIBRARIES ${NetCDF_libs})
set (NETCDF_INCLUDE_DIRS ${NetCDF_includes})
# handle the QUIETLY and REQUIRED arguments and set NETCDF_FOUND to TRUE if
# all listed variables are TRUE
include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (NetCDF
DEFAULT_MSG NETCDF_LIBRARIES NETCDF_INCLUDE_DIRS NETCDF_HAS_INTERFACES)

cmake/Modules/FindQE.cmake (new file)

@ -0,0 +1,29 @@
# - Find quantum-espresso
# Find the native QE headers and libraries.
#
# QE_INCLUDE_DIRS - where to find quantum-espresso.h, etc.
# QE_LIBRARIES - List of libraries when using quantum-espresso.
# QE_FOUND - True if quantum-espresso found.
#
find_path(QE_INCLUDE_DIR libqecouple.h PATH_SUFFIXES COUPLE/include)
find_library(QECOUPLE_LIBRARY NAMES qecouple)
find_library(PW_LIBRARY NAMES pw)
find_library(QEMOD_LIBRARY NAMES qemod)
find_library(QEFFT_LIBRARY NAMES qefft)
find_library(QELA_LIBRARY NAMES qela)
find_library(CLIB_LIBRARY NAMES clib)
find_library(IOTK_LIBRARY NAMES iotk)
set(QE_LIBRARIES ${QECOUPLE_LIBRARY} ${PW_LIBRARY} ${QEMOD_LIBRARY} ${QEFFT_LIBRARY} ${QELA_LIBRARY} ${CLIB_LIBRARY} ${IOTK_LIBRARY})
set(QE_INCLUDE_DIRS ${QE_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set QE_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(QE DEFAULT_MSG QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY QE_INCLUDE_DIR)
mark_as_advanced(QE_INCLUDE_DIR QECOUPLE_LIBRARY PW_LIBRARY QEMOD_LIBRARY QEFFT_LIBRARY QELA_LIBRARY CLIB_LIBRARY IOTK_LIBRARY)

cmake/Modules/FindQUIP.cmake (new file)

@ -0,0 +1,18 @@
# - Find quip
# Find the native QUIP libraries.
#
# QUIP_LIBRARIES - List of libraries when using QUIP.
# QUIP_FOUND - True if QUIP found.
#
find_library(QUIP_LIBRARY NAMES quip)
set(QUIP_LIBRARIES ${QUIP_LIBRARY})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set QUIP_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(QUIP DEFAULT_MSG QUIP_LIBRARY)
mark_as_advanced(QUIP_LIBRARY)

cmake/Modules/FindVORO.cmake (new file)

@ -0,0 +1,22 @@
# - Find voro++
# Find the native VORO headers and libraries.
#
# VORO_INCLUDE_DIRS - where to find voro++.hh, etc.
# VORO_LIBRARIES - List of libraries when using voro++.
# VORO_FOUND - True if voro++ found.
#
find_path(VORO_INCLUDE_DIR voro++.hh PATH_SUFFIXES voro++)
find_library(VORO_LIBRARY NAMES voro++)
set(VORO_LIBRARIES ${VORO_LIBRARY})
set(VORO_INCLUDE_DIRS ${VORO_INCLUDE_DIR})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set VORO_FOUND to TRUE
# if all listed variables are TRUE
find_package_handle_standard_args(VORO DEFAULT_MSG VORO_LIBRARY VORO_INCLUDE_DIR)
mark_as_advanced(VORO_INCLUDE_DIR VORO_LIBRARY )

cmake/Modules/StyleHeaderUtils.cmake (new file)

@ -0,0 +1,132 @@
function(FindStyleHeaders path style_class file_pattern headers)
file(GLOB files "${path}/${file_pattern}*.h")
get_property(hlist GLOBAL PROPERTY ${headers})
foreach(file_name ${files})
file(STRINGS ${file_name} is_style LIMIT_COUNT 1 REGEX ${style_class})
if(is_style)
list(APPEND hlist ${file_name})
endif()
endforeach()
set_property(GLOBAL PROPERTY ${headers} "${hlist}")
endfunction(FindStyleHeaders)
function(FindStyleHeadersExt path style_class extension headers sources)
get_property(hlist GLOBAL PROPERTY ${headers})
get_property(slist GLOBAL PROPERTY ${sources})
set(ext_list)
get_filename_component(abs_path "${path}" ABSOLUTE)
foreach(file_name ${hlist})
get_filename_component(basename ${file_name} NAME_WE)
set(ext_file_name "${abs_path}/${basename}_${extension}.h")
if(EXISTS "${ext_file_name}")
file(STRINGS ${ext_file_name} is_style LIMIT_COUNT 1 REGEX ${style_class})
if(is_style)
list(APPEND ext_list ${ext_file_name})
set(source_file_name "${abs_path}/${basename}_${extension}.cpp")
if(EXISTS "${source_file_name}")
list(APPEND slist ${source_file_name})
endif()
endif()
endif()
endforeach()
list(APPEND hlist ${ext_list})
set_property(GLOBAL PROPERTY ${headers} "${hlist}")
set_property(GLOBAL PROPERTY ${sources} "${slist}")
endfunction(FindStyleHeadersExt)
function(CreateStyleHeader path filename)
math(EXPR N "${ARGC}-2")
set(temp "")
if(N GREATER 0)
math(EXPR ARG_END "${ARGC}-1")
foreach(IDX RANGE 2 ${ARG_END})
list(GET ARGV ${IDX} FNAME)
get_filename_component(FNAME ${FNAME} NAME)
set(temp "${temp}#include \"${FNAME}\"\n")
endforeach()
endif()
message(STATUS "Generating ${filename}...")
file(WRITE "${path}/${filename}.tmp" "${temp}" )
execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${path}/${filename}.tmp" "${path}/${filename}")
endfunction(CreateStyleHeader)
function(GenerateStyleHeader path property style)
get_property(files GLOBAL PROPERTY ${property})
#message("${property} = ${files}")
CreateStyleHeader("${path}" "style_${style}.h" ${files})
endfunction(GenerateStyleHeader)
function(RegisterStyles search_path)
FindStyleHeaders(${search_path} ANGLE_CLASS angle_ ANGLE ) # angle ) # force
FindStyleHeaders(${search_path} ATOM_CLASS atom_vec_ ATOM_VEC ) # atom ) # atom atom_vec_hybrid
FindStyleHeaders(${search_path} BODY_CLASS body_ BODY ) # body ) # atom_vec_body
FindStyleHeaders(${search_path} BOND_CLASS bond_ BOND ) # bond ) # force
FindStyleHeaders(${search_path} COMMAND_CLASS "" COMMAND ) # command ) # input
FindStyleHeaders(${search_path} COMPUTE_CLASS compute_ COMPUTE ) # compute ) # modify
FindStyleHeaders(${search_path} DIHEDRAL_CLASS dihedral_ DIHEDRAL ) # dihedral ) # force
FindStyleHeaders(${search_path} DUMP_CLASS dump_ DUMP ) # dump ) # output write_dump
FindStyleHeaders(${search_path} FIX_CLASS fix_ FIX ) # fix ) # modify
FindStyleHeaders(${search_path} IMPROPER_CLASS improper_ IMPROPER ) # improper ) # force
FindStyleHeaders(${search_path} INTEGRATE_CLASS "" INTEGRATE ) # integrate ) # update
FindStyleHeaders(${search_path} KSPACE_CLASS "" KSPACE ) # kspace ) # force
FindStyleHeaders(${search_path} MINIMIZE_CLASS min_ MINIMIZE ) # minimize ) # update
FindStyleHeaders(${search_path} NBIN_CLASS nbin_ NBIN ) # nbin ) # neighbor
FindStyleHeaders(${search_path} NPAIR_CLASS npair_ NPAIR ) # npair ) # neighbor
FindStyleHeaders(${search_path} NSTENCIL_CLASS nstencil_ NSTENCIL ) # nstencil ) # neighbor
FindStyleHeaders(${search_path} NTOPO_CLASS ntopo_ NTOPO ) # ntopo ) # neighbor
FindStyleHeaders(${search_path} PAIR_CLASS pair_ PAIR ) # pair ) # force
FindStyleHeaders(${search_path} READER_CLASS reader_ READER ) # reader ) # read_dump
FindStyleHeaders(${search_path} REGION_CLASS region_ REGION ) # region ) # domain
endfunction(RegisterStyles)
function(RegisterStylesExt search_path extension sources)
FindStyleHeadersExt(${search_path} ANGLE_CLASS ${extension} ANGLE ${sources})
FindStyleHeadersExt(${search_path} ATOM_CLASS ${extension} ATOM_VEC ${sources})
FindStyleHeadersExt(${search_path} BODY_CLASS ${extension} BODY ${sources})
FindStyleHeadersExt(${search_path} BOND_CLASS ${extension} BOND ${sources})
FindStyleHeadersExt(${search_path} COMMAND_CLASS ${extension} COMMAND ${sources})
FindStyleHeadersExt(${search_path} COMPUTE_CLASS ${extension} COMPUTE ${sources})
FindStyleHeadersExt(${search_path} DIHEDRAL_CLASS ${extension} DIHEDRAL ${sources})
FindStyleHeadersExt(${search_path} DUMP_CLASS ${extension} DUMP ${sources})
FindStyleHeadersExt(${search_path} FIX_CLASS ${extension} FIX ${sources})
FindStyleHeadersExt(${search_path} IMPROPER_CLASS ${extension} IMPROPER ${sources})
FindStyleHeadersExt(${search_path} INTEGRATE_CLASS ${extension} INTEGRATE ${sources})
FindStyleHeadersExt(${search_path} KSPACE_CLASS ${extension} KSPACE ${sources})
FindStyleHeadersExt(${search_path} MINIMIZE_CLASS ${extension} MINIMIZE ${sources})
FindStyleHeadersExt(${search_path} NBIN_CLASS ${extension} NBIN ${sources})
FindStyleHeadersExt(${search_path} NPAIR_CLASS ${extension} NPAIR ${sources})
FindStyleHeadersExt(${search_path} NSTENCIL_CLASS ${extension} NSTENCIL ${sources})
FindStyleHeadersExt(${search_path} NTOPO_CLASS ${extension} NTOPO ${sources})
FindStyleHeadersExt(${search_path} PAIR_CLASS ${extension} PAIR ${sources})
FindStyleHeadersExt(${search_path} READER_CLASS ${extension} READER ${sources})
FindStyleHeadersExt(${search_path} REGION_CLASS ${extension} REGION ${sources})
endfunction(RegisterStylesExt)
function(GenerateStyleHeaders output_path)
GenerateStyleHeader(${output_path} ANGLE angle ) # force
GenerateStyleHeader(${output_path} ATOM_VEC atom ) # atom atom_vec_hybrid
GenerateStyleHeader(${output_path} BODY body ) # atom_vec_body
GenerateStyleHeader(${output_path} BOND bond ) # force
GenerateStyleHeader(${output_path} COMMAND command ) # input
GenerateStyleHeader(${output_path} COMPUTE compute ) # modify
GenerateStyleHeader(${output_path} DIHEDRAL dihedral ) # force
GenerateStyleHeader(${output_path} DUMP dump ) # output write_dump
GenerateStyleHeader(${output_path} FIX fix ) # modify
GenerateStyleHeader(${output_path} IMPROPER improper ) # force
GenerateStyleHeader(${output_path} INTEGRATE integrate ) # update
GenerateStyleHeader(${output_path} KSPACE kspace ) # force
GenerateStyleHeader(${output_path} MINIMIZE minimize ) # update
GenerateStyleHeader(${output_path} NBIN nbin ) # neighbor
GenerateStyleHeader(${output_path} NPAIR npair ) # neighbor
GenerateStyleHeader(${output_path} NSTENCIL nstencil ) # neighbor
GenerateStyleHeader(${output_path} NTOPO ntopo ) # neighbor
GenerateStyleHeader(${output_path} PAIR pair ) # force
GenerateStyleHeader(${output_path} READER reader ) # read_dump
GenerateStyleHeader(${output_path} REGION region ) # domain
endfunction(GenerateStyleHeaders)

cmake/README (new file, +19 lines)

@ -0,0 +1,19 @@
cmake-buildsystem
-----------------
To use the cmake build system instead of the make-driven one, do:
```
cmake /path/to/lammps/source/cmake
```
(please note the cmake directory at the very end)
To enable a package, e.g. GPU, do
```
cmake /path/to/lammps/source/cmake -DENABLE_GPU=ON
```
cmake has many options; to get an overview, use the curses-based cmake interface, ccmake:
```
ccmake /path/to/lammps/source/cmake
```
(Don't forget to press "g" for generate once you are done with configuring)
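As an illustration only (not part of the README above): a typical out-of-source configure-and-build sequence combining several of the options defined in cmake/CMakeLists.txt might look like the following; the build directory name and the selected packages are arbitrary.
```
mkdir build && cd build
cmake /path/to/lammps/source/cmake -DENABLE_MPI=ON -DENABLE_MOLECULE=ON -DENABLE_KSPACE=ON
make -j4
```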

cmake/gpu/lal_pppm_d.cu (new file, +4 lines)

@ -0,0 +1,4 @@
#define grdtyp double
#define grdtyp4 double4
#include "lal_pppm.cu"

cmake/gpu/lal_pppm_f.cu (new file, +4 lines)

@ -0,0 +1,4 @@
#define grdtyp float
#define grdtyp4 float4
#include "lal_pppm.cu"

Binary image file changed (not shown): 14 KiB before, 20 KiB after.


@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="10 Aug 2017 version">
<META NAME="docnumber" CONTENT="17 Aug 2017 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
<H1></H1>
LAMMPS Documentation :c,h3
10 Aug 2017 version :c,h4
17 Aug 2017 version :c,h4
Version info: :h4
@ -79,7 +79,7 @@ bug reports and feature requests are mainly coordinated through the
"LAMMPS project on GitHub."_https://github.com/lammps/lammps
The lammps.org domain, currently hosting "public continuous integration
testing"_https://ci.lammps.org/job/lammps/ and "precompiled Linux
RPM and Windows installer packages"_http://rpm.lammps.org is located
RPM and Windows installer packages"_http://packages.lammps.org is located
at Temple University and managed by Richard Berger,
richard.berger at temple.edu.


@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT.
"hybrid"_pair_hybrid.html,
"hybrid/overlay"_pair_hybrid.html,
"adp (o)"_pair_adp.html,
"airebo (o)"_pair_airebo.html,
"airebo/morse (o)"_pair_airebo.html,
"airebo (oi)"_pair_airebo.html,
"airebo/morse (oi)"_pair_airebo.html,
"beck (go)"_pair_beck.html,
"body"_pair_body.html,
"bop"_pair_bop.html,
@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT.
"dpd/tstat (go)"_pair_dpd.html,
"dsmc"_pair_dsmc.html,
"eam (gkiot)"_pair_eam.html,
"eam/alloy (gkot)"_pair_eam.html,
"eam/fs (gkot)"_pair_eam.html,
"eam/alloy (gkiot)"_pair_eam.html,
"eam/fs (gkiot)"_pair_eam.html,
"eim (o)"_pair_eim.html,
"gauss (go)"_pair_gauss.html,
"gayberne (gio)"_pair_gayberne.html,
@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT.
"kim"_pair_kim.html,
"lcbop"_pair_lcbop.html,
"line/lj"_pair_line_lj.html,
"lj/charmm/coul/charmm (ko)"_pair_charmm.html,
"lj/charmm/coul/charmm (kio)"_pair_charmm.html,
"lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html,
"lj/charmm/coul/long (giko)"_pair_charmm.html,
"lj/charmm/coul/long (gkio)"_pair_charmm.html,
"lj/charmm/coul/msm"_pair_charmm.html,
"lj/charmmfsw/coul/charmmfsh"_pair_charmm.html,
"lj/charmmfsw/coul/long"_pair_charmm.html,
@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT.
"polymorphic"_pair_polymorphic.html,
"python"_pair_python.html,
"reax"_pair_reax.html,
"rebo (o)"_pair_airebo.html,
"rebo (oi)"_pair_airebo.html,
"resquared (go)"_pair_resquared.html,
"snap"_pair_snap.html,
"soft (go)"_pair_soft.html,


@ -7886,8 +7886,8 @@ keyword to allow for additional bonds to be formed :dd
{New bond exceeded special list size in fix bond/create} :dt
See the "special_bonds extra" command
(or the "read_data extra/special/per/atom" command)
See the "read_data extra/special/per/atom" command
(or the "create_box extra/special/per/atom" command)
for info on how to leave space in the special bonds
list to allow for additional bonds to be formed. :dd
@ -9666,8 +9666,8 @@ you are running. :dd
{Special list size exceeded in fix bond/create} :dt
See the special_bonds extra command
(or the read_data extra/special/per/atom command)
See the "read_data extra/special/per/atom" command
(or the "create_box extra/special/per/atom" command)
for info on how to leave space in the special bonds
list to allow for additional bonds to be formed. :dd


@ -662,27 +662,25 @@ your own build system. Due to differences between the Windows OS
and Windows system libraries to Unix-like environments like Linux
or MacOS, when compiling for Windows a few adjustments may be needed:
Do not set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
Do [not] set the -DLAMMPS_MEMALIGN define (see LMP_INC makefile variable)
Add -lwsock32 -lpsapi to the linker flags (see LIB makefile variable)
Try adding -static-libgcc or -static or both to the linker flags when your
LAMMPS executable complains about missing .dll files :ul
Try adding -static-libgcc or -static or both to the linker flags when your LAMMPS executable complains about missing .dll files :ul
Since none of the current LAMMPS core developers
has significant experience building executables on Windows, we are
happy to distribute contributed instructions and modifications, but
we cannot provide support for those.
Since none of the current LAMMPS core developers has significant
experience building executables on Windows, we are happy to distribute
contributed instructions and modifications to improve the situation,
but we cannot provide support for those.
With the so-called "Anniversary Update" to Windows 10, there is a
Ubuntu Linux subsystem available for Windows, that can be installed
and then used to compile/install LAMMPS as if you are running on a
Ubuntu Linux system instead of Windows.
As an alternative, you can download "daily builds" (and some older
versions) of the installer packages from
"rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html.
These executables are built with most optional packages and the
download includes documentation, potential files, some tools and
many examples, but no source code.
As an alternative, you can download pre-compiled installer packages from
"packages.lammps.org/windows.html"_http://packages.lammps.org/windows.html.
These executables are built with most optional packages included and the
download includes documentation, potential files, some tools and many
examples, but no source code.
:line
@ -1095,7 +1093,7 @@ LAMMPS to be built with one or more of its optional packages.
:line
On a Windows box, you can skip making LAMMPS and simply download an
installer package from "here"_http://rpm.lammps.org/windows.html
installer package from "here"_http://packages.lammps.org/windows.html
For running the non-MPI executable, follow these steps:
@ -1107,18 +1105,27 @@ the [in.lj] input from the bench folder. (e.g. by typing: cd "Documents"). :l
At the command prompt, type "lmp_serial -in in.lj", replacing [in.lj]
with the name of your LAMMPS input script. :l
The serial executable includes support for multi-threading
parallelization from the styles in the USER-OMP packages.
To run with, e.g. 4 threads, type "lmp_serial -in in.lj -pk omp 4 -sf omp"
:ule
For the MPI version, which allows you to run LAMMPS under Windows on
multiple processors, follow these steps:
For the MPI version, which allows you to run LAMMPS under Windows with
the more general message passing parallel library (LAMMPS has been
designed from ground up to use MPI efficiently), follow these steps:
Download and install
"MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads
for Windows. :ulb,l
Download and install a compatible MPI library binary package:
for 32-bit Windows
"mpich2-1.4.1p1-win-ia32.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-ia32.msi
and for 64-bit Windows
"mpich2-1.4.1p1-win-x86-64.msi"_download.lammps.org/thirdparty/mpich2-1.4.1p1-win-x86-64.msi
:ulb,l
The LAMMPS Windows installer packages will automatically adjust your
path for the default location of this MPI package. After the installation
of the MPICH software, it needs to be integrated into the system.
of the MPICH2 software, it needs to be integrated into the system.
For this you need to start a Command Prompt in {Administrator Mode}
(right click on the icon and select it). Change into the MPICH2
installation directory, then into the subdirectory [bin] and execute
@ -1137,7 +1144,7 @@ or
mpiexec -np 4 lmp_mpi -in in.lj :pre
replacing in.lj with the name of your LAMMPS input script. For the latter
replacing [in.lj] with the name of your LAMMPS input script. For the latter
case, you may be prompted to enter your password. :l
In this mode, output may not immediately show up on the screen, so if
@ -1149,6 +1156,11 @@ something like:
lmp_mpi -in in.lj :pre
And the parallel executable also includes OpenMP multi-threading, which
can be combined with MPI using something like:
mpiexec -localonly 2 lmp_mpi -in in.lj -pk omp 2 -sf omp :pre
:ule
:line


@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
Dihedral Styles: charmm, harmonic, opls :l
Fixes: nve, npt, nvt, nvt/sllod :l
Improper Styles: cvff, harmonic :l
Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long,
buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm,
lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
sw, tersoff :l
K-Space Styles: pppm, pppm/disp :l
:ule


@ -150,10 +150,9 @@ atoms. Note that adding a single bond always adds a new 1st neighbor
but may also induce *many* new 2nd and 3rd neighbors, depending on the
molecular topology of your system. The "extra special per atom"
parameter must typically be set to allow for the new maximum total
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 3
size (1st + 2nd + 3rd neighbors) of this per-atom list. There are 2
ways to do this. See the "read_data"_read_data.html or
"create_box"_create_box.html or "special_bonds extra" commands for
details.
"create_box"_create_box.html commands for details.
NOTE: Even if you do not use the {atype}, {dtype}, or {itype}
keywords, the list of topological neighbors is updated for atoms


@ -7,10 +7,13 @@
:line
pair_style airebo command :h3
pair_style airebo/intel command :h3
pair_style airebo/omp command :h3
pair_style airebo/morse command :h3
pair_style airebo/morse/intel command :h3
pair_style airebo/morse/omp command :h3
pair_style rebo command :h3
pair_style rebo/intel command :h3
pair_style rebo/omp command :h3
[Syntax:]


@ -7,6 +7,7 @@
:line
pair_style lj/charmm/coul/charmm command :h3
pair_style lj/charmm/coul/charmm/intel command :h3
pair_style lj/charmm/coul/charmm/omp command :h3
pair_style lj/charmm/coul/charmm/implicit command :h3
pair_style lj/charmm/coul/charmm/implicit/omp command :h3


@ -14,6 +14,7 @@ pair_style eam/omp command :h3
pair_style eam/opt command :h3
pair_style eam/alloy command :h3
pair_style eam/alloy/gpu command :h3
pair_style eam/alloy/intel command :h3
pair_style eam/alloy/kk command :h3
pair_style eam/alloy/omp command :h3
pair_style eam/alloy/opt command :h3
@ -21,6 +22,7 @@ pair_style eam/cd command :h3
pair_style eam/cd/omp command :h3
pair_style eam/fs command :h3
pair_style eam/fs/gpu command :h3
pair_style eam/fs/intel command :h3
pair_style eam/fs/kk command :h3
pair_style eam/fs/omp command :h3
pair_style eam/fs/opt command :h3


@ -25,9 +25,7 @@ keyword = {amber} or {charmm} or {dreiding} or {fene} or {lj/coul} or {lj} or {c
{coul} values = w1,w2,w3
w1,w2,w3 = weights (0.0 to 1.0) on pairwise Coulombic interactions
{angle} value = {yes} or {no}
{dihedral} value = {yes} or {no}
{extra} value = N
N = number of extra 1-2,1-3,1-4 interactions to save space for :pre
{dihedral} value = {yes} or {no} :pre
:ule
Examples:
@ -36,8 +34,7 @@ special_bonds amber
special_bonds charmm
special_bonds fene dihedral no
special_bonds lj/coul 0.0 0.0 0.5 angle yes dihedral yes
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes
special_bonds lj/coul 0 1 1 extra 2 :pre
special_bonds lj 0.0 0.0 0.5 coul 0.0 0.0 0.0 dihedral yes :pre
[Description:]
@ -178,14 +175,6 @@ interaction between atoms 2 and 5 will be unaffected (full weighting
of 1.0). If the {dihedral} keyword is specified as {no} which is the
default, then the 2,5 interaction will also be weighted by 0.5.
The {extra} keyword can be used when additional bonds will be created
during a simulation run, e.g. by the "fix
bond/create"_fix_bond_create.html command. It can also be used if
molecules will be added to the system, e.g. via the "fix
deposit"_fix_deposit.html, or "fix pour"_fix_pour.html commands, which
will have atoms with more special neighbors than any atom in the
current system has.
:line
NOTE: LAMMPS stores and maintains a data structure with a list of the
@ -194,8 +183,9 @@ the system). If new bonds are created (or molecules added containing
atoms with more special neighbors), the size of this list needs to
grow. Note that adding a single bond always adds a new 1st neighbor
but may also induce *many* new 2nd and 3rd neighbors, depending on the
molecular topology of your system. Using the {extra} keyword leaves
empty space in the list for this N additional 1st, 2nd, or 3rd
molecular topology of your system. Using the {extra/special/per/atom}
keyword to either "read_data"_read_data.html or "create_box"_create_box.html
reserves empty space in the list for this N additional 1st, 2nd, or 3rd
neighbors to be added. If you do not do this, you may get an error
when bonds (or molecules) are added.
@ -203,8 +193,7 @@ when bonds (or molecules) are added.
NOTE: If you reuse this command in an input script, you should set all
the options you need each time. This command cannot be used a 2nd
time incrementally, e.g. to add some extra storage locations via the
{extra} keyword. E.g. these two commands:
time incrementally. E.g. these two commands:
special_bonds lj 0.0 1.0 1.0
special_bonds coul 0.0 0.0 1.0
@ -221,25 +210,6 @@ Coul: coul 0.0 0.0 1.0
because the LJ settings are reset to their default values
each time the command is issued.
Likewise
special_bonds amber
special_bonds extra 2 :pre
is not the same as this single command:
special_bonds amber extra 2 :pre
since in the former case, the 2nd command will reset all the LJ and
Coulombic weights to 0.0 (the default).
One exception to this rule is the {extra} option itself. It is not
reset to its default value of 0 each time the special_bonds command is
invoked. This is because it can also be set by the
"read_data"_read_data.html and "create_box"_create_box.html commands,
so this command will not override those settings unless you explicitly
use {extra} as an option.
[Restrictions:] none
[Related commands:]

doc/src/tutorial_bash_on_windows.txt (mode changed from executable to normal file, no content changes)


@ -176,12 +176,13 @@ By recognizing the fix {drude}, LAMMPS will find and store matching
DC-DP pairs and will treat DP as equivalent to their DC in the
{special bonds} relations. It may be necessary to extend the space
for storing such special relations. In this case extra space should
be reserved by using the {extra} keyword of the {special_bonds}
be reserved by using the {extra/special/per/atom} keyword of either
the "read_data"_read_data.html or "create_box"_create_box.html
command. With our phenol, there is 1 more special neighbor for which
space is required. Otherwise LAMMPS crashes and gives the required
value.
special_bonds lj/coul 0.0 0.0 0.5 extra 1 :pre
read_data data-p.lmp extra/special/per/atom 1 :pre
Let us assume we want to run a simple NVT simulation at 300 K. Note
that Drude oscillators need to be thermalized at a low temperature in

doc/src/tutorials.txt (mode changed from executable to normal file, no content changes)


@ -45,12 +45,12 @@ while iarg < nargs:
if args[iarg] == "-m":
if iarg+2 > len(args): error()
machine = args[iarg+1]
iarg += 2
iarg += 2
elif args[iarg] == "-e":
if iarg+2 > len(args): error()
extraflag = True
suffix = args[iarg+1]
iarg += 2
iarg += 2
else: error()
# set lib from working dir


@ -32,7 +32,7 @@ where Makefile.g++ uses the GNU C++ compiler and is a good template to start.
**Optional**: if you use the Install.py script provided in this folder, you
can give the machine name as the '-m' argument. This can be the suffix of one
of the files from either this folder, or from src/MAKE.
of the files from either this folder, or from src/MAKE/MACHINES.
*This is only supported by the Install.py within the lib/colvars folder*.
When you are done building this library, two files should
@ -53,10 +53,10 @@ settings in Makefile.common should work.
For the reference manual see:
http://colvars.github.io/colvars-refman-lammps
A copy of reference manual is also in:
A copy of the reference manual is also in:
doc/PDF/colvars-refman-lammps.pdf
Also included is a Doxygen-based developer documentation:
Also available is a Doxygen-based developer documentation:
http://colvars.github.io/doxygen/html/
The reference article is:


@ -88,7 +88,12 @@ public:
static std::vector<feature *> cv_features;
/// \brief Implementation of the feature list accessor for colvar
std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return cv_features;
}
virtual std::vector<feature *> &modify_features()
{
return cv_features;
}


@ -206,7 +206,12 @@ public:
static std::vector<feature *> ag_features;
/// \brief Implementation of the feature list accessor for atom group
virtual std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return ag_features;
}
virtual std::vector<feature *> &modify_features()
{
return ag_features;
}


@ -384,6 +384,7 @@ std::ostream & colvarbias::write_traj(std::ostream &os)
os << " ";
if (b_output_energy)
os << " "
<< std::setprecision(cvm::en_prec) << std::setw(cvm::en_width)
<< bias_energy;
return os;
}


@ -175,7 +175,11 @@ public:
static std::vector<feature *> cvb_features;
/// \brief Implementation of the feature list accessor for colvarbias
virtual std::vector<feature *> &features()
virtual const std::vector<feature *> &features()
{
return cvb_features;
}
virtual std::vector<feature *> &modify_features()
{
return cvb_features;
}


@ -99,12 +99,9 @@ int colvarbias_restraint_centers::init(std::string const &conf)
if (null_centers) {
// try to initialize the restraint centers for the first time
colvar_centers.resize(num_variables());
colvar_centers_raw.resize(num_variables());
for (i = 0; i < num_variables(); i++) {
colvar_centers[i].type(variables(i)->value());
colvar_centers[i].reset();
colvar_centers_raw[i].type(variables(i)->value());
colvar_centers_raw[i].reset();
}
}
@ -113,7 +110,6 @@ int colvarbias_restraint_centers::init(std::string const &conf)
if (cvm::debug()) {
cvm::log("colvarbias_restraint: parsing initial centers, i = "+cvm::to_str(i)+".\n");
}
colvar_centers_raw[i] = colvar_centers[i];
colvar_centers[i].apply_constraints();
}
null_centers = false;
@ -141,8 +137,6 @@ int colvarbias_restraint_centers::change_configuration(std::string const &conf)
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers[i].type(variables(i)->value());
colvar_centers[i].apply_constraints();
colvar_centers_raw[i].type(variables(i)->value());
colvar_centers_raw[i] = colvar_centers[i];
}
}
return COLVARS_OK;
@ -232,7 +226,6 @@ int colvarbias_restraint_moving::set_state_params(std::string const &conf)
{
if (b_chg_centers || b_chg_force_k) {
if (target_nstages) {
// cvm::log ("Reading current stage from the restart.\n");
if (!get_keyval(conf, "stage", stage))
cvm::error("Error: current stage is missing from the restart.\n");
}
@ -265,100 +258,127 @@ int colvarbias_restraint_centers_moving::init(std::string const &conf)
size_t i;
if (get_keyval(conf, "targetCenters", target_centers, colvar_centers)) {
if (colvar_centers.size() != num_variables()) {
if (target_centers.size() != num_variables()) {
cvm::error("Error: number of target centers does not match "
"that of collective variables.\n");
"that of collective variables.\n", INPUT_ERROR);
}
b_chg_centers = true;
for (i = 0; i < target_centers.size(); i++) {
target_centers[i].apply_constraints();
centers_incr.push_back(colvar_centers[i]);
centers_incr[i].reset();
}
}
if (b_chg_centers) {
// parse moving restraint options
// parse moving schedule options
colvarbias_restraint_moving::init(conf);
if (initial_centers.size() == 0) {
// One-time init
initial_centers = colvar_centers;
}
// Call to check that the definition is correct
for (i = 0; i < num_variables(); i++) {
colvarvalue const midpoint =
colvarvalue::interpolate(initial_centers[i],
target_centers[i],
0.5);
}
} else {
target_centers.clear();
return COLVARS_OK;
}
get_keyval(conf, "outputCenters", b_output_centers, b_output_centers);
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work, b_output_acc_work);
get_keyval(conf, "outputAccumulatedWork", b_output_acc_work,
b_output_acc_work); // TODO this conflicts with stages
return COLVARS_OK;
}
int colvarbias_restraint_centers_moving::update_centers(cvm::real lambda)
{
if (cvm::debug()) {
cvm::log("Updating centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
size_t i;
for (i = 0; i < num_variables(); i++) {
colvarvalue const c_new = colvarvalue::interpolate(initial_centers[i],
target_centers[i],
lambda);
centers_incr[i] = (c_new).dist2_grad(colvar_centers[i]);
colvar_centers[i] = c_new;
variables(i)->wrap(colvar_centers[i]);
}
if (cvm::debug()) {
cvm::log("New centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
return cvm::get_error();
}
int colvarbias_restraint_centers_moving::update()
{
if (b_chg_centers) {
if (cvm::debug()) {
cvm::log("Updating centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
}
if (!centers_incr.size()) {
// if this is the first calculation, calculate the advancement
// at each simulation step (or stage, if applicable)
// (take current stage into account: it can be non-zero
// if we are restarting a staged calculation)
centers_incr.resize(num_variables());
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].type(variables(i)->value());
centers_incr[i] = (target_centers[i] - colvar_centers_raw[i]) /
cvm::real( target_nstages ? (target_nstages - stage) :
(target_nsteps - cvm::step_absolute()));
}
if (cvm::debug()) {
cvm::log("Center increment for the restraint bias \""+
this->name+"\": "+cvm::to_str(centers_incr)+" at stage "+cvm::to_str(stage)+ ".\n");
}
}
if (target_nstages) {
if ((cvm::step_relative() > 0)
&& (cvm::step_absolute() % target_nsteps) == 0
&& stage < target_nstages) {
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers_raw[i] += centers_incr[i];
colvar_centers[i] = colvar_centers_raw[i];
variables(i)->wrap(colvar_centers[i]);
colvar_centers[i].apply_constraints();
// Staged update
if (stage <= target_nstages) {
if ((cvm::step_relative() > 0) &&
((cvm::step_absolute() % target_nsteps) == 1)) {
cvm::real const lambda =
cvm::real(stage)/cvm::real(target_nstages);
update_centers(lambda);
stage++;
cvm::log("Moving restraint \"" + this->name +
"\" stage " + cvm::to_str(stage) +
" : setting centers to " + cvm::to_str(colvar_centers) +
" at step " + cvm::to_str(cvm::step_absolute()));
} else {
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].reset();
}
}
stage++;
cvm::log("Moving restraint \"" + this->name +
"\" stage " + cvm::to_str(stage) +
" : setting centers to " + cvm::to_str(colvar_centers) +
" at step " + cvm::to_str(cvm::step_absolute()));
}
} else if ((cvm::step_relative() > 0) && (cvm::step_absolute() <= target_nsteps)) {
// move the restraint centers in the direction of the targets
// (slow growth)
} else {
// Continuous update
if (cvm::step_absolute() <= target_nsteps) {
cvm::real const lambda =
cvm::real(cvm::step_absolute())/cvm::real(target_nsteps);
update_centers(lambda);
} else {
for (size_t i = 0; i < num_variables(); i++) {
centers_incr[i].reset();
}
}
}
if (cvm::step_relative() == 0) {
for (size_t i = 0; i < num_variables(); i++) {
colvar_centers_raw[i] += centers_incr[i];
colvar_centers[i] = colvar_centers_raw[i];
variables(i)->wrap(colvar_centers[i]);
colvar_centers[i].apply_constraints();
// finite differences are undefined when restarting
centers_incr[i].reset();
}
}
if (cvm::debug()) {
cvm::log("New centers for the restraint bias \""+
this->name+"\": "+cvm::to_str(colvar_centers)+".\n");
cvm::log("Center increment for the restraint bias \""+
this->name+"\": "+cvm::to_str(centers_incr)+
" at stage "+cvm::to_str(stage)+ ".\n");
}
}
return COLVARS_OK;
return cvm::get_error();
}
int colvarbias_restraint_centers_moving::update_acc_work()
{
if (b_output_acc_work) {
if ((cvm::step_relative() > 0) || (cvm::step_absolute() == 0)) {
if ((cvm::step_relative() > 0) &&
(cvm::step_absolute() <= target_nsteps)) {
for (size_t i = 0; i < num_variables(); i++) {
// project forces on the calculated increments at this step
acc_work += colvar_forces[i] * centers_incr[i];
@ -383,13 +403,6 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
<< colvar_centers[i];
}
os << "\n";
os << "centers_raw ";
for (i = 0; i < num_variables(); i++) {
os << " "
<< std::setprecision(cvm::cv_prec) << std::setw(cvm::cv_width)
<< colvar_centers_raw[i];
}
os << "\n";
if (b_output_acc_work) {
os << "accumulatedWork "
@ -398,7 +411,7 @@ std::string const colvarbias_restraint_centers_moving::get_state_params() const
}
}
return colvarbias_restraint_moving::get_state_params() + os.str();
return os.str();
}
@ -410,8 +423,6 @@ int colvarbias_restraint_centers_moving::set_state_params(std::string const &con
// cvm::log ("Reading the updated restraint centers from the restart.\n");
if (!get_keyval(conf, "centers", colvar_centers))
cvm::error("Error: restraint centers are missing from the restart.\n");
if (!get_keyval(conf, "centers_raw", colvar_centers_raw))
cvm::error("Error: \"raw\" restraint centers are missing from the restart.\n");
if (b_output_acc_work) {
if (!get_keyval(conf, "accumulatedWork", acc_work))
cvm::error("Error: accumulatedWork is missing from the restart.\n");
@ -609,7 +620,7 @@ std::string const colvarbias_restraint_k_moving::get_state_params() const
<< std::setprecision(cvm::en_prec)
<< std::setw(cvm::en_width) << force_k << "\n";
}
return colvarbias_restraint_moving::get_state_params() + os.str();
return os.str();
}
@ -770,6 +781,7 @@ cvm::real colvarbias_restraint_harmonic::d_restraint_potential_dk(size_t i) cons
std::string const colvarbias_restraint_harmonic::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_centers_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -779,6 +791,7 @@ int colvarbias_restraint_harmonic::set_state_params(std::string const &conf)
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
@ -1037,6 +1050,7 @@ cvm::real colvarbias_restraint_harmonic_walls::d_restraint_potential_dk(size_t i
std::string const colvarbias_restraint_harmonic_walls::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -1045,6 +1059,7 @@ int colvarbias_restraint_harmonic_walls::set_state_params(std::string const &con
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
}
@ -1164,6 +1179,7 @@ cvm::real colvarbias_restraint_linear::d_restraint_potential_dk(size_t i) const
std::string const colvarbias_restraint_linear::get_state_params() const
{
return colvarbias_restraint::get_state_params() +
colvarbias_restraint_moving::get_state_params() +
colvarbias_restraint_centers_moving::get_state_params() +
colvarbias_restraint_k_moving::get_state_params();
}
@ -1173,6 +1189,7 @@ int colvarbias_restraint_linear::set_state_params(std::string const &conf)
{
int error_code = COLVARS_OK;
error_code |= colvarbias_restraint::set_state_params(conf);
error_code |= colvarbias_restraint_moving::set_state_params(conf);
error_code |= colvarbias_restraint_centers_moving::set_state_params(conf);
error_code |= colvarbias_restraint_k_moving::set_state_params(conf);
return error_code;
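
The moving-centers update in the hunks above stops accumulating per-step increments and instead evaluates the centers directly at a schedule parameter lambda: stage/target_nstages for staged runs, step/target_nsteps for continuous ones. A minimal standalone sketch of the two schedules, using illustrative names rather than the Colvars API:

#include <cstdio>

// Sketch of the schedule parameter used by update() above: staged runs jump
// at stage boundaries, continuous runs advance a little every step.
double lambda_staged(int stage, int nstages) {
  return double(stage) / double(nstages);        // 0, 1/n, 2/n, ..., 1
}

double lambda_continuous(long step, long nsteps) {
  return double(step) / double(nsteps);          // fraction of the run completed
}

int main() {
  for (int stage = 0; stage <= 4; ++stage)
    std::printf("stage %d -> lambda = %.2f\n", stage, lambda_staged(stage, 4));
  std::printf("step 2500 of 10000 -> lambda = %.2f\n",
              lambda_continuous(2500, 10000));
  return 0;
}

Because lambda is recomputed from the absolute step or stage count, the current centers can always be rebuilt from initial_centers and target_centers, which is consistent with dropping colvar_centers_raw and its restart keyword in the same change.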

View File

@ -74,9 +74,6 @@ protected:
/// \brief Restraint centers
std::vector<colvarvalue> colvar_centers;
/// \brief Restraint centers outside the domain of the colvars (no wrapping or constraints applied)
std::vector<colvarvalue> colvar_centers_raw;
};
@ -156,10 +153,16 @@ protected:
/// \brief New restraint centers
std::vector<colvarvalue> target_centers;
/// \brief Initial value of the restraint centers
std::vector<colvarvalue> initial_centers;
/// \brief Amplitude of the restraint centers' increment at each step
/// (or stage) towards the new values (calculated from target_nsteps)
/// towards the new values (calculated from target_nsteps)
std::vector<colvarvalue> centers_incr;
/// \brief Update the centers by interpolating between initial and target
virtual int update_centers(cvm::real lambda);
/// Whether to write the current restraint centers to the trajectory file
bool b_output_centers;

View File

@ -132,9 +132,15 @@ public:
static std::vector<feature *> cvc_features;
/// \brief Implementation of the feature list accessor for colvar
virtual std::vector<feature *> &features() {
virtual const std::vector<feature *> &features()
{
return cvc_features;
}
virtual std::vector<feature *> &modify_features()
{
return cvc_features;
}
/// \brief Obtain data needed for the calculation for the backend
virtual void read_data();

View File

@ -374,8 +374,8 @@ int colvardeps::decr_ref_count(int feature_id) {
}
void colvardeps::init_feature(int feature_id, const char *description, feature_type type) {
features()[feature_id]->description = description;
features()[feature_id]->type = type;
modify_features()[feature_id]->description = description;
modify_features()[feature_id]->type = type;
}
// Shorthand macros for describing dependencies
@ -401,7 +401,7 @@ void colvardeps::init_cvb_requires() {
int i;
if (features().size() == 0) {
for (i = 0; i < f_cvb_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cvb_active, "active", f_type_dynamic);
@ -438,7 +438,7 @@ void colvardeps::init_cv_requires() {
size_t i;
if (features().size() == 0) {
for (i = 0; i < f_cv_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cv_active, "active", f_type_dynamic);
@ -554,7 +554,7 @@ void colvardeps::init_cvc_requires() {
// Initialize static array once and for all
if (features().size() == 0) {
for (i = 0; i < colvardeps::f_cvc_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_cvc_active, "active", f_type_dynamic);
@ -633,7 +633,7 @@ void colvardeps::init_ag_requires() {
// Initialize static array once and for all
if (features().size() == 0) {
for (i = 0; i < f_ag_ntot; i++) {
features().push_back(new feature);
modify_features().push_back(new feature);
}
init_feature(f_ag_active, "active", f_type_dynamic);

View File

@ -135,7 +135,8 @@ public:
// with a non-static array
// Intermediate classes (colvarbias and colvarcomp, which are also base classes)
// implement this as virtual to allow overriding
virtual std::vector<feature *>&features() = 0;
virtual const std::vector<feature *>&features() = 0;
virtual std::vector<feature *>&modify_features() = 0;
void add_child(colvardeps *child);
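
The colvardeps change above splits the single mutable accessor into a read-only features() and a mutable modify_features(). A minimal standalone sketch of the pattern (illustrative names, not the Colvars classes):

#include <iostream>
#include <string>
#include <vector>

struct feature {
  std::string description;
};

class deps_base {
public:
  virtual ~deps_base() {}
  // read-only access used by ordinary feature queries
  virtual const std::vector<feature *> &features() = 0;
  // mutable access, reserved for one-time initialization code
  virtual std::vector<feature *> &modify_features() = 0;
};

class my_colvar : public deps_base {
public:
  static std::vector<feature *> cv_features;   // shared by all instances
  virtual const std::vector<feature *> &features() { return cv_features; }
  virtual std::vector<feature *> &modify_features() { return cv_features; }
};

std::vector<feature *> my_colvar::cv_features;

int main() {
  my_colvar cv;
  if (cv.features().empty()) {                 // query through the const accessor
    feature *f = new feature;
    f->description = "active";
    cv.modify_features().push_back(f);         // populate through the mutable one
  }
  std::cout << cv.features()[0]->description << std::endl;
  delete cv.modify_features()[0];
  return 0;
}

Keeping the static per-class list behind a const reference means feature queries cannot accidentally mutate it; only the init_*_requires()-style setup paths go through modify_features(), as the colvardeps.cpp hunks above show.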

View File

@ -1,4 +1,5 @@
#define COLVARS_VERSION "2017-07-15"
#ifndef COLVARS_VERSION
#define COLVARS_VERSION "2017-08-06"
// This file is part of the Collective Variables module (Colvars).
// The original version of Colvars and its updates are located at:
// https://github.com/colvars/colvars
@ -6,3 +7,4 @@
// If you wish to distribute your changes, please submit them to the
// Colvars repository at GitHub.
#endif

View File

@ -472,7 +472,7 @@ int colvarscript::proc_features(colvardeps *obj,
}
if ((subcmd == "get") || (subcmd == "set")) {
std::vector<colvardeps::feature *> &features = obj->features();
std::vector<colvardeps::feature *> const &features = obj->features();
std::string const req_feature(obj_to_str(objv[3]));
colvardeps::feature *f = NULL;
int fid = 0;

View File

@ -19,6 +19,17 @@ bool colvarmodule::rotation::monitor_crossings = false;
cvm::real colvarmodule::rotation::crossing_threshold = 1.0E-02;
/// Numerical recipes diagonalization
static int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
/// Eigenvector sort
static int eigsrt(cvm::real *d, cvm::real **v);
/// Transpose the matrix
static int transpose(cvm::real **v);
std::string cvm::rvector::to_simple_string() const
{
std::ostringstream os;
@ -286,7 +297,12 @@ void colvarmodule::rotation::diagonalize_matrix(cvm::matrix2d<cvm::real> &S,
// diagonalize
int jac_nrot = 0;
jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot);
if (jacobi(S.c_array(), S_eigval.c_array(), S_eigvec.c_array(), &jac_nrot) !=
COLVARS_OK) {
cvm::error("Too many iterations in routine jacobi.\n"
"This is usually the result of an ill-defined set of atoms for "
"rotational alignment (RMSD, rotateReference, etc).\n");
}
eigsrt(S_eigval.c_array(), S_eigvec.c_array());
// jacobi saves eigenvectors by columns
transpose(S_eigvec.c_array());
@ -528,7 +544,7 @@ void colvarmodule::rotation::calc_optimal_rotation(std::vector<cvm::atom_pos> co
#define n 4
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
int jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
{
int j,iq,ip,i;
cvm::real tresh,theta,tau,t,sm,s,h,g,c;
@ -554,7 +570,7 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
sm += std::fabs(a[ip][iq]);
}
if (sm == 0.0) {
return;
return COLVARS_OK;
}
if (i < 4)
tresh=0.2*sm/(n*n);
@ -606,10 +622,11 @@ void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot)
z[ip]=0.0;
}
}
cvm::error("Too many iterations in routine jacobi.\n");
return COLVARS_ERROR;
}
void eigsrt(cvm::real *d, cvm::real **v)
int eigsrt(cvm::real *d, cvm::real **v)
{
int k,j,i;
cvm::real p;
@ -628,9 +645,11 @@ void eigsrt(cvm::real *d, cvm::real **v)
}
}
}
return COLVARS_OK;
}
void transpose(cvm::real **v)
int transpose(cvm::real **v)
{
cvm::real p;
int i,j;
@ -641,6 +660,7 @@ void transpose(cvm::real **v)
v[j][i]=p;
}
}
return COLVARS_OK;
}
#undef n

View File

@ -1020,16 +1020,6 @@ inline cvm::rvector operator * (cvm::rmatrix const &m,
}
/// Numerical recipes diagonalization
void jacobi(cvm::real **a, cvm::real *d, cvm::real **v, int *nrot);
/// Eigenvector sort
void eigsrt(cvm::real *d, cvm::real **v);
/// Transpose the matrix
void transpose(cvm::real **v);
/// \brief 1-dimensional vector of real numbers with four components and

View File

@ -570,6 +570,50 @@ colvarvalue colvarvalue::dist2_grad(colvarvalue const &x2) const
}
/// Return the midpoint between x1 and x2, optionally weighted by lambda
/// (which must be between 0.0 and 1.0)
colvarvalue const colvarvalue::interpolate(colvarvalue const &x1,
colvarvalue const &x2,
cvm::real const lambda)
{
colvarvalue::check_types(x1, x2);
if ((lambda < 0.0) || (lambda > 1.0)) {
cvm::error("Error: trying to interpolate between two colvarvalues with a "
"lambda outside [0:1].\n", BUG_ERROR);
}
colvarvalue interp = ((1.0-lambda)*x1 + lambda*x2);
cvm::real const d2 = x1.dist2(x2);
switch (x1.type()) {
case colvarvalue::type_scalar:
case colvarvalue::type_3vector:
case colvarvalue::type_vector:
case colvarvalue::type_unit3vectorderiv:
case colvarvalue::type_quaternionderiv:
return interp;
break;
case colvarvalue::type_unit3vector:
case colvarvalue::type_quaternion:
if (interp.norm()/std::sqrt(d2) < 1.0e-6) {
cvm::error("Error: interpolation between "+cvm::to_str(x1)+" and "+
cvm::to_str(x2)+" with lambda = "+cvm::to_str(lambda)+
" is undefined: result = "+cvm::to_str(interp)+"\n",
INPUT_ERROR);
}
interp.apply_constraints();
return interp;
break;
case colvarvalue::type_notset:
default:
x1.undef_op();
break;
}
return colvarvalue(colvarvalue::type_notset);
}
std::string colvarvalue::to_simple_string() const
{
switch (type()) {
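
The new colvarvalue::interpolate() above blends two values as (1-lambda)*x1 + lambda*x2 and, for constrained types such as unit vectors and quaternions, re-applies the constraint and flags near-degenerate results. A simplified standalone sketch for the unit-vector case only (plain C++, not the Colvars types; the real code compares the blended norm against sqrt(dist2(x1,x2)) rather than an absolute threshold):

#include <cmath>
#include <cstdio>

struct vec3 { double x, y, z; };

// (1-lambda)*a + lambda*b, then renormalize; the blend is undefined when it
// collapses to (nearly) zero length, e.g. for antiparallel unit vectors.
bool interpolate_unit(const vec3 &a, const vec3 &b, double lambda, vec3 &out) {
  out.x = (1.0 - lambda) * a.x + lambda * b.x;
  out.y = (1.0 - lambda) * a.y + lambda * b.y;
  out.z = (1.0 - lambda) * a.z + lambda * b.z;
  double norm = std::sqrt(out.x * out.x + out.y * out.y + out.z * out.z);
  if (norm < 1.0e-6) return false;
  out.x /= norm; out.y /= norm; out.z /= norm;
  return true;
}

int main() {
  vec3 a = {1.0, 0.0, 0.0}, b = {0.0, 1.0, 0.0}, mid;
  if (interpolate_unit(a, b, 0.5, mid))
    std::printf("midpoint: %g %g %g\n", mid.x, mid.y, mid.z);  // ~0.707 0.707 0
  vec3 c = {-1.0, 0.0, 0.0};
  if (!interpolate_unit(a, c, 0.5, mid))
    std::printf("midpoint of antipodal unit vectors is undefined\n");
  return 0;
}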

View File

@ -193,6 +193,12 @@ public:
/// Derivative with respect to this \link colvarvalue \endlink of the square distance
colvarvalue dist2_grad(colvarvalue const &x2) const;
/// Return the midpoint between x1 and x2, optionally weighted by lambda
/// (which must be between 0.0 and 1.0)
static colvarvalue const interpolate(colvarvalue const &x1,
colvarvalue const &x2,
cvm::real const lambda = 0.5);
/// Assignment operator (type of x is checked)
colvarvalue & operator = (colvarvalue const &x);
@ -285,10 +291,10 @@ public:
cvm::real & operator [] (int const i);
/// Ensure that the two types are the same within a binary operator
int static check_types(colvarvalue const &x1, colvarvalue const &x2);
static int check_types(colvarvalue const &x1, colvarvalue const &x2);
/// Ensure that the two types are the same within an assignment, or that the left side is type_notset
int static check_types_assign(Type const &vt1, Type const &vt2);
static int check_types_assign(Type const &vt1, Type const &vt2);
/// Undefined operation
void undef_op() const;
@ -317,14 +323,14 @@ public:
/// \brief Optimized routine for the inner product of one collective
/// variable with an array
void static inner_opt(colvarvalue const &x,
static void inner_opt(colvarvalue const &x,
std::vector<colvarvalue>::iterator &xv,
std::vector<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
/// \brief Optimized routine for the inner product of one collective
/// variable with an array
void static inner_opt(colvarvalue const &x,
static void inner_opt(colvarvalue const &x,
std::list<colvarvalue>::iterator &xv,
std::list<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
@ -332,14 +338,14 @@ public:
/// \brief Optimized routine for the second order Legendre
/// polynomial, (3cos^2(w)-1)/2, of one collective variable with an
/// array
void static p2leg_opt(colvarvalue const &x,
static void p2leg_opt(colvarvalue const &x,
std::vector<colvarvalue>::iterator &xv,
std::vector<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);
/// \brief Optimized routine for the second order Legendre
/// polynomial of one collective variable with an array
void static p2leg_opt(colvarvalue const &x,
static void p2leg_opt(colvarvalue const &x,
std::list<colvarvalue>::iterator &xv,
std::list<colvarvalue>::iterator const &xv_end,
std::vector<cvm::real>::iterator &result);

View File

@ -14,7 +14,7 @@ Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -
specify one or more options, order does not matter
copies an existing Makefile.machine in lib/gpu to Makefile.auto
copies an existing Makefile.machine in lib/gpu to Makefile.auto
optionally edits these variables in Makefile.auto:
CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE
optionally uses Makefile.auto to build the GPU library -> libgpu.a
@ -26,7 +26,7 @@ optionally copies Makefile.auto to a new Makefile.osuffix
-h = set CUDA_HOME variable in Makefile.auto to hdir
hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
-a = set CUDA_ARCH variable in Makefile.auto to arch
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
or GeForce GTX 580 or similar
use arch = 30 for Tesla K10 (Kepler)
use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
@ -108,10 +108,10 @@ if pflag:
elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE"
elif precision == "single": precstr = "-D_SINGLE_SINGLE"
else: error("Invalid precision setting")
# create Makefile.auto
# reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested
if not os.path.exists("Makefile.%s" % isuffix):
error("lib/gpu/Makefile.%s does not exist" % isuffix)

View File

@ -22,21 +22,21 @@
offset=tid & (t_per_atom-1); \
ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom;
#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \
i, numj, stride, nbor_end, nbor_begin) \
i=nbor_mem[ii]; \
nbor_begin=ii+nbor_stride; \
numj=nbor_mem[nbor_begin]; \
if (nbor_mem==packed_mem) { \
nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \
stride=fast_mul(t_per_atom,nbor_stride); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \
#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \
i, numj, n_stride, nbor_end, nbor_begin) \
i=dev_nbor[ii]; \
nbor_begin=ii+nbor_pitch; \
numj=dev_nbor[nbor_begin]; \
if (dev_nbor==dev_packed) { \
nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \
n_stride=fast_mul(t_per_atom,nbor_pitch); \
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \
nbor_begin+=offset; \
} else { \
nbor_begin+=nbor_stride; \
nbor_begin=nbor_mem[nbor_begin]; \
nbor_begin+=nbor_pitch; \
nbor_begin=dev_nbor[nbor_begin]; \
nbor_end=nbor_begin+numj; \
stride=t_per_atom; \
n_stride=t_per_atom; \
nbor_begin+=offset; \
}

View File

@ -20,7 +20,7 @@ using namespace LAMMPS_AL;
extern Device<PRECISION,ACC_PRECISION> global_device;
template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_two, const char *k_three_center,
const char *k_three_end) {
const char *two, const char *three_center,
const char *three_end, const char *short_nbor) {
screen=_screen;
int gpu_nbor=0;
@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_gpu_host=1;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) {
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
} else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end);
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
_max_an_bytes+=ans2->gpu_bytes();
#endif
int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);
return 0;
}
@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() {
k_three_end.clear();
k_three_end_vatom.clear();
k_pair.clear();
k_short_nbor.clear();
delete pair_program;
_compiled=false;
}
@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();
dev_short_nbor.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
if (!success)
return NULL;
_nall = nall;
// originally the requirement that nall == nlist was enforced
// to allow direct indexing neighbors of neighbors after re-arrangement
// nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size());
@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
return 0;
atom->cast_copy_x(host_x,host_type);
_nall = nall;
int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}
// _ainum to be used in loop() for short neighbor list build
_ainum = nall;
int evatom=0;
if (eatom || vatom)
evatom=1;
@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const {
template <class numtyp, class acctyp>
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *ktwo, const char *kthree_center,
const char *kthree_end) {
const char *two, const char *three_center,
const char *three_end, const char* short_nbor) {
if (_compiled)
return;
std::string vatom_name=std::string(kthree_end)+"_vatom";
std::string vatom_name=std::string(three_end)+"_vatom";
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
k_three_center.set_function(*pair_program,kthree_center);
k_three_end.set_function(*pair_program,kthree_end);
k_three_center.set_function(*pair_program,three_center);
k_three_end.set_function(*pair_program,three_end);
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
k_pair.set_function(*pair_program,ktwo);
k_pair.set_function(*pair_program,two);
k_short_nbor.set_function(*pair_program,short_nbor);
pos_tex.get_texture(*pair_program,"pos_tex");
#ifdef THREE_CONCURRENT

View File

@ -56,7 +56,8 @@ class BaseThree {
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end);
const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=NULL);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -73,18 +74,18 @@ class BaseThree {
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
/** \param inum number of particles whose nbors must be stored on device
* \param max_nbors maximum number of neighbors
* \param success set to false if insufficient memory
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
/** \param inum number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param current maximum number of neighbors
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
@ -143,14 +144,6 @@ class BaseThree {
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
@ -193,6 +186,9 @@ class BaseThree {
/// Neighbor data
Neighbor *nbor;
UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
@ -207,12 +203,13 @@ class BaseThree {
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k_two, const char *k_three_center,
const char *k_three_end);
const char *two, const char *three_center,
const char *three_end, const char* short_nbor);
virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;
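
The BaseThree changes above add a device buffer dev_short_nbor, sized nall*(2+max_nbors) and regrown with roughly 10% headroom, that holds a cutoff-trimmed copy of the neighbor list: for each atom slot a count is stored first, followed by the surviving neighbor indices (strided by n_stride on the GPU). A serial, illustrative-only sketch of the same trimming step:

#include <cstdio>
#include <vector>

struct atom { double x, y, z; };

// Keep only the neighbors of each atom that fall inside the cutoff; the GPU
// kernels store the same information as a count followed by the kept indices.
void build_short_nbor(const std::vector<atom> &x,
                      const std::vector<std::vector<int> > &full_nbor,
                      double cutsq,
                      std::vector<std::vector<int> > &short_nbor) {
  short_nbor.assign(x.size(), std::vector<int>());
  for (size_t i = 0; i < x.size(); ++i) {
    for (size_t jj = 0; jj < full_nbor[i].size(); ++jj) {
      int j = full_nbor[i][jj];
      double dx = x[i].x - x[j].x, dy = x[i].y - x[j].y, dz = x[i].z - x[j].z;
      if (dx*dx + dy*dy + dz*dz < cutsq)
        short_nbor[i].push_back(j);
    }
  }
}

int main() {
  std::vector<atom> x(3);
  x[0].x = 0.0; x[1].x = 1.0; x[2].x = 5.0;   // y and z stay zero
  std::vector<std::vector<int> > full(3), shortlist;
  full[0].push_back(1); full[0].push_back(2); // atom 0 sees atoms 1 and 2
  build_short_nbor(x, full, 4.0, shortlist);  // squared cutoff = 4.0
  std::printf("atom 0 keeps %lu of %lu neighbors\n",
              (unsigned long)shortlist[0].size(),
              (unsigned long)full[0].size()); // keeps 1 of 2
  return 0;
}

Trimming once up front lets the two-body, three-center, and three-end kernels iterate only over neighbors inside the short cutoff instead of re-testing the full list on every pass.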

View File

@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,sw,"k_sw","k_sw_three_center",
"k_sw_three_end");
"k_sw_three_end","k_sw_short_nbor");
if (success!=0)
return success;
@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);

View File

@ -130,6 +130,63 @@ texture<int4> sw3_tex;
#endif
__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw3,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch, const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
__kernel void k_sw(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict sw1,
@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag, const int inum,
@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem = dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_,
sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma;
sw_cut_ij=sw3_ijparam.x;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for the use of short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff,"k_tersoff_repulsive",
"k_tersoff_three_center", "k_tersoff_three_end");
"k_tersoff_three_center", "k_tersoff_three_end",
"k_tersoff_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
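
The restructured TersoffT::loop() above is a three-pass sequence: build the short neighbor list, fill the per-pair _zetaij array with k_zeta, then run the repulsive and three-body kernels against the stored values. A toy serial sketch of this precompute-then-consume structure (not the Tersoff functional form; the formulas and names here are illustrative only):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int nlocal = 3;
  double x[3] = {0.0, 1.0, 2.0};                       // 1-D toy positions
  std::vector<std::vector<double> > zeta(nlocal, std::vector<double>(nlocal, 0.0));

  // pass 1: accumulate zeta_ij over the remaining neighbors k of atom i
  for (int i = 0; i < nlocal; ++i)
    for (int j = 0; j < nlocal; ++j) {
      if (j == i) continue;
      for (int k = 0; k < nlocal; ++k) {
        if (k == i || k == j) continue;
        zeta[i][j] += std::exp(-std::fabs(x[i] - x[k]));   // toy three-body weight
      }
    }

  // pass 2: consume the stored zeta_ij in the pair loop
  double energy = 0.0;
  for (int i = 0; i < nlocal; ++i)
    for (int j = 0; j < nlocal; ++j) {
      if (j == i) continue;
      double bij = 1.0 / std::sqrt(1.0 + zeta[i][j]);      // toy bond-order factor
      energy += bij * std::fabs(x[i] - x[j]);
    }
  std::printf("toy two-pass energy: %g\n", energy);
  return 0;
}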

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use with the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use with the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_,
virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]);
virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]);
virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]);
}
} // for nbor

View File

@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
const double* h, const double* gamma, const double* beta,
const double* powern, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_mod,"k_tersoff_mod_repulsive",
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end");
"k_tersoff_mod_three_center", "k_tersoff_mod_three_end",
"k_tersoff_mod_short_nbor");
if (success!=0)
return success;
@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -219,171 +225,6 @@ double TersoffMT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffMT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffMT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -402,9 +243,40 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
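  // Capacity note: zetaij is grown whenever nall*_max_nbors exceeds the
  // current allocation, with ~10% headroom on the atom count so that small
  // fluctuations in nall do not force a reallocation every time.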
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}
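// Net effect of the loop() changes above, per call (a condensed summary; grid
// setup and full argument lists omitted):
//   1. k_short_nbor trims dev_nbor/dev_packed into dev_short_nbor
//      (per-lane neighbor count plus strided indices, using cutsq);
//   2. _zetaij is grown to at least nall*_max_nbors if needed, then k_zeta
//      precomputes zeta_ij for every surviving i-j pair;
//   3. the repulsive kernel (k_pair) accumulates the two-body contribution;
//   4. k_three_center and k_three_end / k_three_end_vatom consume _zetaij and
//      dev_short_nbor for the three-body terms.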

View File

@ -106,7 +106,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -155,7 +155,7 @@ texture<int4> ts5_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -164,6 +164,65 @@ texture<int4> ts5_tex;
#endif
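// Rename note: store_zeta -> acc_zeta reflects that the macro only reduces the
// per-lane partial zeta across the t_per_atom lanes of a sub-group (a shared
// memory tree on one code path, a shfl_xor butterfly on the other); the store
// into zetaij happens outside the macro, at the index worked out at each
// acc_zeta call site.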
__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
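For reference, the same count-slot-plus-strided-entries layout and the matching consumer-side read can be sketched in plain host C; the function and variable names below are illustrative only and exist nowhere in the GPU library:
#include <stdio.h>
/* Build a short list for one lane: the neighbor count goes into slot `base`,
   surviving packed indices into base+stride, base+2*stride, ... */
static void build_short(const int *packed, int numj, const double *rsq,
                        double cutsq, int stride, int base, int *short_nbor) {
  int ncount = 0;
  int slot = base + stride;                  /* first data slot */
  for (int m = 0; m < numj; m++) {
    if (rsq[m] < cutsq) {
      short_nbor[slot] = packed[m];
      slot += stride;
      ncount++;
    }
  }
  short_nbor[base] = ncount;                 /* count slot */
}
int main(void) {
  const int packed[] = {7, 12, 3, 9};        /* packed neighbor indices of i */
  const double rsq[] = {1.0, 9.0, 2.5, 0.4}; /* squared distances to atom i */
  const double cutsq = 4.0;
  const int stride = 2, base = 0;            /* stride stands in for n_stride */
  int short_nbor[16] = {0};
  build_short(packed, 4, rsq, cutsq, stride, base, short_nbor);
  /* Consumer-side read, as the pair kernels do it: read the count, step past
     the count slot, then walk the strided data slots. */
  int numj = short_nbor[base];
  for (int slot = base + stride; numj > 0; slot += stride, numj--)
    printf("%d ", short_nbor[slot]);         /* prints: 7 3 9 */
  printf("\n");
  return 0;
}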
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (numtyp)0;
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use with the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use with the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
const double* h, const double* beta, const double* powern,
const double* powern_del, const double* ca1, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex;
int _max_nbors;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,tersoff_zbl,"k_tersoff_zbl_repulsive",
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end");
"k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end",
"k_tersoff_zbl_short_nbor");
if (success!=0)
return success;
@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<nparams; i++)
double cutsqmax = 0.0;
for (int i=0; i<nparams; i++) {
cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
}
cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
ucl_copy(cutsq,cutsq_view,false);
_cutshortsq = static_cast<numtyp>(cutsqmax);
UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
*(this->ucl_device), UCL_WRITE_ONLY);
@ -244,171 +250,6 @@ double TersoffZT::host_memory_usage() const {
#define KTHREADS this->_threads_per_atom
#define JTHREADS this->_threads_per_atom
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void TersoffZT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return;
}
int ago=this->hd_balancer.ago_first(f_ago);
int inum=this->hd_balancer.balance(ago,inum_full,cpu_time);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
if (ago==0) {
this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist);
}
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nlist;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** TersoffZT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
this->acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
this->resize_atom(0,nall,success);
this->zero_timers();
return NULL;
}
this->hd_balancer.balance(cpu_time);
int inum=this->hd_balancer.get_gpu_count(ago,inum_full);
this->ans->inum(inum);
#ifdef THREE_CONCURRENT
this->ans2->inum(inum);
#endif
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
this->hd_balancer.start_timer();
} else {
this->atom->cast_x_data(host_x,host_type);
this->hd_balancer.start_timer();
this->atom->add_x_data(host_x,host_type);
}
*ilist=this->nbor->host_ilist.begin();
*jnum=this->nbor->host_acc.begin();
// re-allocate zetaij if necessary
if (nall*_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(_max_nbors*_nmax);
}
int _eflag;
if (eflag)
_eflag=1;
else
_eflag=0;
int ainum=nall;
int nbor_pitch=this->nbor->nbor_pitch();
int BX=this->block_pair();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom);
int evatom=0;
if (eatom || vatom)
evatom=1;
#ifdef THREE_CONCURRENT
this->ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
this->ans->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans);
#ifdef THREE_CONCURRENT
this->ans2->copy_answers(eflag,vflag,eatom,vatom);
this->device->add_ans_object(this->ans2);
#endif
this->hd_balancer.stop_timer();
return this->nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
@ -427,9 +268,40 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int ainum=this->ans->inum();
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &cutsq, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// re-allocate zetaij if necessary
int nall = this->_nall;
if (nall*this->_max_nbors > _zetaij.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
_zetaij.resize(this->_max_nbors*_nmax);
}
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/
(BX/(JTHREADS*KTHREADS))));
this->k_zeta.set_size(GX,BX);
this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom);
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
this->time_pair.start();
@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
&_global_e, &_global_a_0, &_global_epsilon_0, &cutsq,
&map, &elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv,
&eflag, &vflag, &ainum, &nbor_pitch,
&this->_threads_per_atom);
@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq,
&map, &elem2param, &_nelements, &_nparams, &_zetaij,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -109,7 +109,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
__local acctyp red_acc[BLOCK_PAIR]; \
red_acc[tid]=z; \
@ -158,7 +158,7 @@ texture<int4> ts6_tex;
ans[ii]=old; \
}
#define store_zeta(z, tid, t_per_atom, offset) \
#define acc_zeta(z, tid, t_per_atom, offset) \
if (t_per_atom>1) { \
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
z += shfl_xor(z, s, t_per_atom); \
@ -167,6 +167,65 @@ texture<int4> ts6_tex;
#endif
__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp *restrict cutsq,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
// Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries
// while the block size should never be less than 32.
// SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block
@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__global acctyp4 * zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
const int eflag, const int inum,
const int nbor_pitch, const int t_per_atom) {
__local int tpa_sq,n_stride;
@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor_j, nbor_end;
int i, numj;
int nbor_j, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
delr1.z = jx.z-ix.z;
numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z;
if (rsq1 > cutsq[ijparam]) continue;
// compute zeta_ij
z = (acctyp)0;
int nbor_k = nborj_start-offset_j+offset_k;
for ( ; nbor_k < nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k < k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == j) continue;
@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
store_zeta(z, tid, t_per_atom, offset_k);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acc_zeta(z, tid, t_per_atom, offset_k);
numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex);
numtyp ijparam_lam2 = ts1_ijparam.y;
@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
const int* nbor_mem=dev_packed;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor];
nbor += n_stride;
nbor_end = nbor+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int j=nbor_mem[nbor];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_,
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<cutsq[ijparam]) {
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
// rsq<cutsq[ijparam]
numtyp feng[2];
numtyp ijparam_lam1 = ts1[ijparam].x;
numtyp4 ts2_ijparam = ts2[ijparam];
numtyp ijparam_biga = ts2_ijparam.x;
numtyp ijparam_bigr = ts2_ijparam.z;
numtyp ijparam_bigd = ts2_ijparam.w;
numtyp4 ts6_ijparam = ts6[ijparam];
numtyp ijparam_Z_i = ts6_ijparam.x;
numtyp ijparam_Z_j = ts6_ijparam.y;
numtyp ijparam_ZBLcut = ts6_ijparam.z;
numtyp ijparam_ZBLexpscale = ts6_ijparam.w;
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga,
ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale,
global_e, global_a_0, global_epsilon_0, rsq, eflag, feng);
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
numtyp force = feng[0];
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
if (eflag>0)
energy+=feng[1];
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
} // for nbor
@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
const __global acctyp4 *restrict zetaij,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
int offset_k=tid & (t_per_atom-1);
int nborj_start = nbor_j;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp r1 = ucl_sqrt(rsq1);
numtyp r1inv = ucl_rsqrt(rsq1);
@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
//int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride;
//int idx = jj*n_stride + i*t_per_atom + offset_j;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
i, nbor_j, offset_j, idx);
//idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor
int idx = nbor_j;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// i, nbor_j, offset_j, idx);
acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex);
numtyp force = zeta_ij.x*tpainv;
numtyp prefactor = zeta_ij.y;
@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_,
virial[5] += delr1[1]*delr1[2]*mforce;
}
int nbor_k=nborj_start-offset_j+offset_k;
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int nbor_k = nborj_start-offset_j+offset_k;
int k_end = nbor_end;
if (dev_packed==dev_nbor) {
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (j == k) continue;
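The k loop fetches its count one stride back because nborj_start was taken after the count slot had already been skipped; re-basing from the j lane to the k lane (assuming the per-lane starts differ only by the offset) keeps that count slot at nbor_k - n_stride. A condensed sketch, illustrative only:

// k-loop bounds when the short list is active (hypothetical restatement)
void k_bounds(const int *dev_short_nbor, int nborj_start, int offset_j,
              int offset_k, int n_stride, int &nbor_k, int &k_end) {
  nbor_k = nborj_start - offset_j + offset_k;    // same list, k-thread lane
  int numk = dev_short_nbor[nbor_k - n_stride];  // count slot precedes the data
  k_end = nbor_k + numk * n_stride;              // fast_mul(numk, n_stride) above
}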
@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem=dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;
int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype];
@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
itype=map[itype];
numtyp tpainv = ucl_recip((numtyp)t_per_atom);
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
delr1[2] = jx.z-ix.z;
numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
if (rsq1 > cutsq[ijparam]) continue;
numtyp mdelr1[3];
mdelr1[0] = -delr1[0];
mdelr1[1] = -delr1[1];
@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
k_end=nbor_k+numk;
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
int nbork_start = nbor_k;
// look up zeta_ji: find i in j's neighbor list
int m = tid / t_per_atom;
int ijnum = -1;
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) {
ijnum = nbor_k;
@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride;
//int idx = iix*n_stride + j*t_per_atom + offset_kf;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, ijnum, offset_kf, idx);
//idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor
int idx = ijnum;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, ijnum, offset_kf, idx);
acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex);
numtyp force = zeta_ji.x*tpainv;
numtyp prefactor_ji = zeta_ji.y;
@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
// attractive forces
for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_,
//int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride;
//int idx = kk*n_stride + j*t_per_atom + offset_k;
int idx;
zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
j, nbor_k, offset_k, idx);
//idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor
int idx = nbor_k;
if (dev_packed==dev_nbor) idx -= n_stride;
// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom,
// j, nbor_k, offset_k, idx);
acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex);
numtyp prefactor_jk = zeta_jk.y;

View File

@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
const double* ZBLcut, const double* ZBLexpscale, const double global_e,
const double global_a_0, const double global_epsilon_0, const double* cutsq);
/// Pair loop with host neighboring
void compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
UCL_Kernel k_zeta;
UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex;
int _max_nbors;
numtyp _global_e,_global_a_0,_global_epsilon_0;
numtyp _cutshortsq;
private:
bool _allocated;

View File

@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
int success;
success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split,
_screen,vashishta,"k_vashishta","k_vashishta_three_center",
"k_vashishta_three_end");
"k_vashishta_three_end","k_vashishta_short_nbor");
if (success!=0)
return success;
@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i
param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
double r0sqmax = 0;
for (int i=0; i<nparams; i++) {
double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4?
double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4?
if (r0sqmax < r0sq) r0sqmax = r0sq;
dview[i].x=static_cast<numtyp>(r0sq);
dview[i].y=static_cast<numtyp>(gamma[i]);
dview[i].z=static_cast<numtyp>(cutsq[i]);
dview[i].w=static_cast<numtyp>(r0[i]);
}
_cutshortsq = static_cast<numtyp>(r0sqmax);
ucl_copy(param4,dview,false);
param4_tex.get_texture(*(this->pair_program),"param4_tex");
param4_tex.bind_float(param4,4);
@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
// build the short neighbor list
int ainum=this->_ainum;
int nbor_pitch=this->nbor->nbor_pitch();
int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
(BX/this->_threads_per_atom)));
this->k_short_nbor.set_size(GX,BX);
this->k_short_nbor.run(&this->atom->x, &param4, &map,
&elem2param, &_nelements, &_nparams,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor, &ainum,
&nbor_pitch, &this->_threads_per_atom);
// this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1
// this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1
int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
ainum=this->ans->inum();
nbor_pitch=this->nbor->nbor_pitch();
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
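The launch grid follows GX = ceil(ainum / (BX / tpa)), since a block of BX threads covers BX/tpa atoms when tpa threads cooperate on each atom; a small worked sketch with made-up sizes (not taken from the patch):

#include <cmath>

// hypothetical sizes, for illustration only
static int grid_size(int ainum, int BX, int tpa) {
  return static_cast<int>(std::ceil(static_cast<double>(ainum)/(BX/tpa)));
}
// grid_size(10000, 128, 4) -> ceil(10000/32) = 313 blocks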
this->time_pair.start();
// note that k_pair does not run with the short neighbor list
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
this->k_three_center.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->dev_short_nbor,
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &evatom);
Answer<numtyp,acctyp> *end_ans;
@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) {
end_ans=this->ans;
#endif
if (evatom!=0) {
this->k_three_end_vatom.set_size(GX,BX);
this->k_three_end_vatom.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
} else {
this->k_three_end.set_size(GX,BX);
this->k_three_end.run(&this->atom->x, &param1, &param2, &param3, &param4, &param5,
&map, &elem2param, &_nelements,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->nbor->dev_acc,
&this->nbor->dev_acc, &this->dev_short_nbor,
&end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum,
&nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor);
}

View File

@ -136,6 +136,64 @@ texture<int4> param5_tex;
#endif
__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param4,
const __global int *restrict map,
const __global int *restrict elem2param,
const int nelements, const int nparams,
const __global int * dev_nbor,
const __global int * dev_packed,
__global int * dev_short_nbor,
const int inum, const int nbor_pitch,
const int t_per_atom) {
__local int n_stride;
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype=ix.w;
itype=map[itype];
int ncount = 0;
int m = nbor;
dev_short_nbor[m] = 0;
int nbor_short = nbor+n_stride;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];
int nj = j;
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;
jtype=map[jtype];
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq
dev_short_nbor[nbor_short] = nj;
nbor_short += n_stride;
ncount++;
}
} // for nbor
// store the number of neighbors for each thread
dev_short_nbor[m] = ncount;
} // if ii
}
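A host-side sketch of the layout this kernel produces, assuming one strided slice per thread: the header slot keeps the number of surviving neighbors and the kept entries follow at n_stride spacing (function and names are illustrative, not part of the patch):

void build_short_list(const int *packed, int start, int end, int n_stride,
                      const double *rsq, double r0sq, int *short_nbor) {
  int count = 0;
  int out = start + n_stride;           // data begins one stride after the header
  for (int p = start, q = 0; p < end; p += n_stride, ++q) {
    if (rsq[q] < r0sq) {                // keep only pairs inside the short cutoff
      short_nbor[out] = packed[p];      // copy the packed entry unchanged
      out += n_stride;
      ++count;
    }
  }
  short_nbor[start] = count;            // header slot: kept-neighbor count
}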
__kernel void k_vashishta(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict param1,
@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
__syncthreads();
if (ii<inum) {
int nbor, nbor_end;
int i, numj;
int nbor, nbor_end, i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);
@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp param3_dvrc=param3_ijparam.z;
numtyp param3_c0 =param3_ijparam.w;
numtyp r=sqrt(rsq);
numtyp r=ucl_sqrt(rsq);
numtyp rinvsq=1.0/rsq;
numtyp r4inv = rinvsq*rinvsq;
numtyp r6inv = rinvsq*r4inv;
@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp reta = pow(r,-param1_eta);
numtyp lam1r = r*param1_lam1inv;
numtyp lam4r = r*param1_lam4inv;
numtyp vc2 = param1_zizj * exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r);
numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r;
numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r);
numtyp force = (param2_dvrc*r
- (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv
@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rinvsq2 = ucl_recip(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp frad2 = facrad*gsrainvsq2; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,
numtyp r1 = ucl_sqrt(rsq1); \
numtyp rinvsq1 = ucl_recip(rsq1); \
numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainv1 = param_gamma_ij * rainv1; \
numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \
numtyp expgsrainv1 = ucl_exp(gsrainv1); \
\
numtyp r2 = ucl_sqrt(rsq2); \
numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp gsrainv2 = param_gamma_ik * rainv2; \
numtyp expgsrainv2 = ucl_exp(gsrainv2); \
\
numtyp rinv12 = ucl_recip(r1*r2); \
numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcs = cs - param_costheta_ijk; \
numtyp delcssq = delcs*delcs; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \
numtyp pcsinvsq = pcsinv*pcsinv; \
numtyp pcs = delcssq/pcsinv; \
\
numtyp facexp = expgsrainv1*expgsrainv2; \
\
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp facrad = param_bigb_ijk * facexp*pcs; \
numtyp frad1 = facrad*gsrainvsq1; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \
numtyp facang12 = rinv12*facang; \
numtyp csfacang = cs*facang; \
numtyp csfac1 = rinvsq1*csfacang; \
@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
const int nelements,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
int nborj_start = nbor_j;
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still needed for host neighboring (neigh no) with tpa > 1, where the short list is not used
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
int nbor_k=nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j)
nbor_k+=n_stride;
int nbor_k,k_end;
if (dev_packed==dev_nbor) {
nbor_k=nborj_start-offset_j+offset_k;
int numk = dev_short_nbor[nbor_k-n_stride];
k_end = nbor_k+fast_mul(numk,n_stride);
} else {
nbor_k = nbor_j-offset_j+offset_k;
if (nbor_k<=nbor_j) nbor_k += n_stride;
k_end = nbor_end;
}
for ( ; nbor_k<nbor_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (dev_packed==dev_nbor && k <= j) continue;
numtyp4 kx; fetch4(kx,k,pos_tex);
int ktype=kx.w;
ktype=map[ktype];
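With the short list every lane restarts k from the beginning of i's list, so each unordered {j,k} pair would otherwise be visited twice; the added k <= j test appears to preserve the half enumeration that the old nbor_k > nbor_j start provided. For a short list {5, 9, 12}: j=5 pairs with k=9 and k=12, j=9 pairs with k=12, and j=12 pairs with nothing, so every jk pair around i is handled exactly once.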
@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij = param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still needed for host neighboring (neigh no) with tpa > 1, where the short list is not used
param_gamma_ij=param4_ijparam.y;
param_r0_ij = param4_ijparam.w;
@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;
@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const __global int * dev_acc,
const __global int * dev_short_nbor,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
if (ii<inum) {
int i, numj, nbor_j, nbor_end, k_end;
const int* nbor_mem = dev_packed;
int offset_j=offset/t_per_atom;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj,
n_stride,nbor_end,nbor_j);
@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int itype=ix.w;
itype=map[itype];
// recalculate numj and nbor_end for use of the short nbor list
if (dev_packed==dev_nbor) {
numj = dev_short_nbor[nbor_j];
nbor_j += n_stride;
nbor_end = nbor_j+fast_mul(numj,n_stride);
nbor_mem = dev_short_nbor;
}
for ( ; nbor_j<nbor_end; nbor_j+=n_stride) {
int j=dev_packed[nbor_j];
int j=nbor_mem[nbor_j];
j &= NEIGHMASK;
numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
param_r0sq_ij=param4_ijparam.x;
if (rsq1 > param_r0sq_ij) continue;
if (rsq1 > param_r0sq_ij) continue; // still needed for host neighboring (neigh no) with tpa > 1, where the short list is not used
param_gamma_ij=param4_ijparam.y;
param_r0_ij=param4_ijparam.w;
@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
nbor_k+=offset_k;
}
// recalculate numk and k_end for use of the short neighbor list
if (dev_packed==dev_nbor) {
numk = dev_short_nbor[nbor_k];
nbor_k += n_stride;
k_end = nbor_k+fast_mul(numk,n_stride);
}
for ( ; nbor_k<k_end; nbor_k+=n_stride) {
int k=dev_packed[nbor_k];
int k=nbor_mem[nbor_k];
k &= NEIGHMASK;
if (k == i) continue;

View File

@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> {
UCL_D_Vec<int> elem2param;
UCL_D_Vec<int> map;
int _nparams,_nelements;
numtyp _cutshortsq;
UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex;

View File

@ -6,6 +6,8 @@
from __future__ import print_function
import sys,os,re,subprocess
# help message
help = """
Syntax from src dir: make lib-kim args="-b -v version -a kim-name"
or: make lib-kim args="-b -a everything"
@ -23,7 +25,7 @@ specify one or more options, order does not matter
-b = download and build base KIM API library with example Models
this will delete any previous installation in the current folder
-n = do NOT download and build base KIM API library.
Use an existing installation
Use an existing installation
-p = specify location of KIM API installation (implies -n)
-a = add single KIM model or model driver with kim-name
to existing KIM API lib (see example below).
@ -78,13 +80,27 @@ def which(program):
return None
def geturl(url,fname):
success = False
if which('curl') != None:
cmd = 'curl -L -o "%s" %s' % (fname,url)
elif which('wget') != None:
try:
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
success = True
except subprocess.CalledProcessError as e:
print("Calling curl failed with: %s" % e.output.decode('UTF-8'))
if not success and which('wget') != None:
cmd = 'wget -O "%s" %s' % (fname,url)
else: error("cannot find 'wget' or 'curl' to download source code")
txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
return txt
try:
subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
success = True
except subprocess.CalledProcessError as e:
print("Calling wget failed with: %s" % e.output.decode('UTF-8'))
if not success:
error("Failed to download source code with 'curl' or 'wget'")
return
# parse args

View File

@ -1,5 +1,46 @@
# Change Log
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
**Implemented enhancements:**
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
**Fixed bugs:**
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
- Kokkos CUDA failes to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)

View File

@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
# Check for advanced settings.
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l))
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l))
@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l))
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l))
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
ifneq ($(OMPI_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l))
endif
ifneq ($(MPICH_CXX),)
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l))
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l))
endif
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
endif
endif
# Set compiler warning flags.
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
# TODO check if PGI accepts GNU style warnings
KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# TODO check if cray accepts GNU style warnings
KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
#gcc
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
endif
endif
endif
endif
endif
else
KOKKOS_INTERNAL_COMPILER_WARNINGS =
endif
# Set OpenMP flags.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
@ -162,6 +193,7 @@ endif
# Intel based.
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Decide what ISA level we are able to support.
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@ -257,12 +290,10 @@ endif
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
# No warnings:
KOKKOS_CXXFLAGS =
# INTEL and CLANG warnings:
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
# GCC warnings:
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
endif
KOKKOS_LIBS = -lkokkos -ldl
KOKKOS_LDFLAGS = -L$(shell pwd)
@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xSSE4.2
KOKKOS_LDFLAGS += -xSSE4.2
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_CXXFLAGS += -tp=nehalem
KOKKOS_LDFLAGS += -tp=nehalem
else
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -msse4.2
KOKKOS_LDFLAGS += -msse4.2
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
endif
endif
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
else

View File

@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)

View File

@ -61,14 +61,19 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned threads_count = omp_get_max_threads();
int threads_count = 0;
#pragma omp parallel
{
#pragma omp atomic
++threads_count;
}
if ( Kokkos::hwloc::available() ) {
threads_count = Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa();
if (threads_count > 3) {
threads_count /= 2;
}
Kokkos::OpenMP::initialize( threads_count );
Kokkos::OpenMP::print_configuration( std::cout );
}
static void TearDownTestCase()
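The replacement counts the threads the runtime actually launches rather than trusting omp_get_max_threads(); the same idiom as a standalone sketch:

#include <omp.h>

// size of the parallel team the runtime really creates, which can differ
// from omp_get_max_threads() depending on environment settings and nesting
int count_omp_threads() {
  int threads_count = 0;
  #pragma omp parallel
  {
    #pragma omp atomic
    ++threads_count;
  }
  return threads_count;
}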

View File

@ -1,12 +1,12 @@
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -35,7 +35,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
@ -283,12 +283,12 @@ struct test_random_scalar {
RandomGenerator& pool,
unsigned int num_draws)
{
using std::cerr;
using std::cout;
using std::endl;
using Kokkos::parallel_reduce;
{
cerr << " -- Testing randomness properties" << endl;
cout << " -- Testing randomness properties" << endl;
RandomProperties result;
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
@ -307,7 +307,7 @@ struct test_random_scalar {
( 1.5*tolerance > variance_eps)) ? 1:0;
pass_covar = ((-2.0*tolerance < covariance_eps) &&
( 2.0*tolerance > covariance_eps)) ? 1:0;
cerr << "Pass: " << pass_mean
cout << "Pass: " << pass_mean
<< " " << pass_var
<< " " << mean_eps
<< " " << variance_eps
@ -315,7 +315,7 @@ struct test_random_scalar {
<< " || " << tolerance << endl;
}
{
cerr << " -- Testing 1-D histogram" << endl;
cout << " -- Testing 1-D histogram" << endl;
RandomProperties result;
typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
@ -335,7 +335,7 @@ struct test_random_scalar {
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
( 0.06 > covariance_eps)) ? 1:0;
cerr << "Density 1D: " << mean_eps
cout << "Density 1D: " << mean_eps
<< " " << variance_eps
<< " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
<< " || " << tolerance
@ -348,7 +348,7 @@ struct test_random_scalar {
<< endl;
}
{
cerr << " -- Testing 3-D histogram" << endl;
cout << " -- Testing 3-D histogram" << endl;
RandomProperties result;
typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
@ -368,7 +368,7 @@ struct test_random_scalar {
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
cerr << "Density 3D: " << mean_eps
cout << "Density 3D: " << mean_eps
<< " " << variance_eps
<< " " << result.covariance/HIST_DIM1D/HIST_DIM1D
<< " || " << tolerance
@ -381,18 +381,18 @@ struct test_random_scalar {
template <class RandomGenerator>
void test_random(unsigned int num_draws)
{
using std::cerr;
using std::cout;
using std::endl;
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
cerr << "Test Seed:" << ticks << endl;
cout << "Test Seed:" << ticks << endl;
RandomGenerator pool(ticks);
cerr << "Test Scalar=int" << endl;
cout << "Test Scalar=int" << endl;
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int.pass_mean,1);
ASSERT_EQ( test_int.pass_var,1);
@ -406,7 +406,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=unsigned int" << endl;
cout << "Test Scalar=unsigned int" << endl;
test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_uint.pass_mean,1);
ASSERT_EQ( test_uint.pass_var,1);
@ -420,7 +420,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=int64_t" << endl;
cout << "Test Scalar=int64_t" << endl;
test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int64.pass_mean,1);
ASSERT_EQ( test_int64.pass_var,1);
@ -434,7 +434,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=uint64_t" << endl;
cout << "Test Scalar=uint64_t" << endl;
test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_uint64.pass_mean,1);
ASSERT_EQ( test_uint64.pass_var,1);
@ -448,7 +448,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=float" << endl;
cout << "Test Scalar=float" << endl;
test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_float.pass_mean,1);
ASSERT_EQ( test_float.pass_var,1);
@ -462,7 +462,7 @@ void test_random(unsigned int num_draws)
deep_copy(density_1d,0);
deep_copy(density_3d,0);
cerr << "Test Scalar=double" << endl;
cout << "Test Scalar=double" << endl;
test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_double.pass_mean,1);
ASSERT_EQ( test_double.pass_var,1);

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/

View File

@ -44,12 +44,13 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<bench.hpp>
#include<cstdlib>
int main(int argc, char* argv[]) {
Kokkos::initialize();
if(argc<10) {
if(argc<10) {
printf("Arguments: N K R D U F T S\n");
printf(" P: Precision (1==float, 2==double)\n");
printf(" N,K: dimensions of the 2D array to allocate\n");
@ -68,7 +69,7 @@ int main(int argc, char* argv[]) {
Kokkos::finalize();
return 0;
}
int P = atoi(argv[1]);
int N = atoi(argv[2]);
@ -80,7 +81,7 @@ int main(int argc, char* argv[]) {
int T = atoi(argv[8]);
int S = atoi(argv[9]);
if(U>8) {printf("U must be 1-8\n"); return 0;}
if(U>8) {printf("U must be 1-8\n"); return 0;}
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}

View File

@ -44,11 +44,11 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<gather.hpp>
#include<cstdlib>
int main(int argc, char* argv[]) {
Kokkos::initialize(argc,argv);
if(argc<8) {
printf("Arguments: S N K D\n");
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ../..
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
CXXFLAGS = -O3 -g
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
KOKKOS_CUDA_OPTIONS+=enable_lambda
else
CXX = g++
CXXFLAGS = -O3 -g -Wall -Werror
LINK = ${CXX}
LINKFLAGS =
EXE = policy_performance.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,170 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include "policy_perf_test.hpp"
int main(int argc, char* argv[] ) {
Kokkos::initialize(argc,argv);
if(argc<10) {
printf(" Ten arguments are needed to run this program:\n");
printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
printf(" team_range: number of teams (league_size)\n");
printf(" thread_range: range for nested TeamThreadRange parallel_*\n");
printf(" vector_range: range for nested ThreadVectorRange parallel_*\n");
printf(" outer_repeat: number of repeats for outer parallel_* call\n");
printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n");
printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n");
printf(" team_size: number of team members (team_size)\n");
printf(" vector_size: desired vectorization (if possible)\n");
printf(" schedule: 1 == Static 2 == Dynamic\n");
printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n");
printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
printf(" TeamPolicy:\n");
printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
printf(" RangePolicy:\n");
printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
printf(" Y: 0 = none\n");
printf(" Z: 0 = none\n");
printf(" Example Input:\n");
printf(" 100000 32 32 100 100 100 8 1 1 100\n");
Kokkos::finalize();
return 0;
}
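Reading the code key above: test_type 100, as in the example input, is a plain TeamPolicy parallel_for with no nested loops; 212 would be a TeamPolicy parallel_reduce containing a TeamThreadRange parallel_for and a ThreadVectorRange parallel_reduce; 400 is a flat RangePolicy parallel_reduce.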
int team_range = atoi(argv[1]);
int thread_range = atoi(argv[2]);
int vector_range = atoi(argv[3]);
int outer_repeat = atoi(argv[4]);
int thread_repeat = atoi(argv[5]);
int vector_repeat = atoi(argv[6]);
int team_size = atoi(argv[7]);
int vector_size = atoi(argv[8]);
int schedule = atoi(argv[9]);
int test_type = atoi(argv[10]);
int disable_verbose_output = 0;
if ( argc > 11 ) {
disable_verbose_output = atoi(argv[11]);
}
if ( schedule != 1 && schedule != 2 ) {
printf("schedule: %d\n", schedule);
printf("Options for schedule are: 1 == Static 2 == Dynamic\n");
Kokkos::finalize();
return -1;
}
if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122
&& test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222
&& test_type != 300 && test_type != 400 && test_type != 500
)
{
printf("Incorrect test_type option\n");
Kokkos::finalize();
return -2;
}
double result = 0.0;
Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1),
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
lval += 1;
}, result);
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
// Allocate view without initializing
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for the test and should obey first-touch etc.
// Second call to test is the one we actually care about and time
view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
double result_computed = 0.0;
double result_expect = 0.0;
double time = 0.0;
if(schedule==1) {
if ( test_type != 500 ) {
// warmup - no repeat of loops
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
else {
// parallel_scan: initialize 1d view for parallel_scan
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
}
if(schedule==2) {
if ( test_type != 500 ) {
// warmup - no repeat of loops
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
else {
// parallel_scan: initialize 1d view for parallel_scan
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
}
}
if ( disable_verbose_output == 0 ) {
printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
}
else {
printf("%lf\n",time);
}
Kokkos::finalize();
return 0;
}

View File

@ -0,0 +1,354 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
template < class ViewType >
struct ParallelScanFunctor {
using value_type = double;
ViewType v;
ParallelScanFunctor( const ViewType & v_ )
: v(v_)
{}
KOKKOS_INLINE_FUNCTION
void operator()( const int idx, value_type& val, const bool& final ) const
{
// inclusive scan
val += v(idx);
if ( final ) {
v(idx) = val;
}
}
};
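// ParallelScanFunctor is the explicit-functor form of the inclusive scan used by test_type 500
// below; it exists because the equivalent lambda (kept under "#if 0" in test 500) does not
// compile with pre-CUDA-8.0 toolchains (see GitHub Issue #913).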
template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
void test_policy(int team_range, int thread_range, int vector_range,
int outer_repeat, int thread_repeat, int inner_repeat,
int team_size, int vector_size, int test_type,
ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
double &result, double &result_expect, double &time) {
typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
typedef typename t_policy::member_type t_team;
Kokkos::Timer timer;
for(int orep = 0; orep<outer_repeat; orep++) {
if (test_type == 100) {
Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
v1(idx) = idx;
// prevent compiler optimizing loop away
});
}
if (test_type == 110) {
Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2( idx, t ) = t;
// prevent compiler optimizing loop away
});
}
});
}
if (test_type == 111) {
Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
for (int vr = 0; vr<inner_repeat; ++vr)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
v3( idx, t, vi ) = vi;
// prevent compiler optimizing loop away
});
});
}
});
}
if (test_type == 112) {
Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
for (int tr = 0; tr<thread_repeat; ++tr) {
// Each team launches a parallel_for; thread_range is partitioned among team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
vval += 1;
}, vector_result);
}
v2( idx, t ) = vector_result;
// prevent compiler optimizing loop away
});
}
});
}
if (test_type == 120) {
Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
team_result = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
lval += 1;
}, team_result);
}
v1(idx) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 121) {
Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
team_result = 0.0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
lval += 1;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
v3( idx, t, vi ) = vi;
// prevent compiler optimizing loop away
});
}
}, team_result);
}
v3( idx, 0, 0 ) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 122) {
Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0.0;
for (int tr = 0; tr<thread_repeat; ++tr) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
vval += 1;
}, vector_result);
} // braces ensure the vector-level reduce is repeated inner_repeat times, as in tests 112 and 222
lval += vector_result;
}, team_result);
}
v1(idx) = team_result;
// prevent compiler optimizing loop away
});
}
if (test_type == 200) {
Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
lval+=team.team_size()*team.league_rank() + team.team_rank();
},result);
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
// sum ( seq( [0, team_range*team_size) )
}
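// Derivation of result_expect for test 200: each of the N = team_range*team_size iterations
// contributes its global index idx, so the total is sum_{idx=0}^{N-1} idx = N*(N-1)/2.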
if (test_type == 210) {
Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double thread_for = 1.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2(idx,t) = t;
// prevent compiler optimizing loop away
});
}
lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
},result);
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
}
if (test_type == 211) {
Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double thread_for = 1.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
for (int vr = 0; vr<inner_repeat; ++vr)
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
v3(idx, t, vi) = vi;
// prevent compiler optimizing loop away
});
});
}
lval+=idx+thread_for;
},result);
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
}
if (test_type == 212) {
Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double vector_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
// This parallel_for is executed by each team; the thread_range is partitioned among the team members
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
v2(idx,t) = t;
// prevent compiler optimizing loop away
for (int vr = 0; vr<inner_repeat; ++vr) {
vector_result = 0.0;
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
vval += vi;
}, vector_result );
}
});
}
lval+= idx + vector_result;
},result);
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
// sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
}
if (test_type == 220) {
Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
double team_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
tval += t;
},team_result);
}
lval+=team_result*team.league_rank(); // constant * league_rank
},result);
result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
// sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
if (test_type == 221) {
Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
long idx = team.league_rank()*team.team_size() + team.team_rank();
double team_result = 0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
double vector_for = 1.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
v3(idx, t, vi) = vi;
// prevent compiler optimizing loop away
});
}
tval += t + vector_for;
},team_result);
}
lval+=team_result*team.league_rank();
},result);
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
// sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
if (test_type == 222) {
Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
KOKKOS_LAMBDA (const t_team& team, double& lval) {
double team_result = 0.0;
for(int tr = 0; tr<thread_repeat; tr++) {
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
double vector_result = 0.0;
for (int vr = 0; vr<inner_repeat; ++vr) {
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
vval += vi;
}, vector_result);
}
tval += t + vector_result;
},team_result);
}
lval+=team_result*team.league_rank();
},result);
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
// sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
}
// parallel_for RangePolicy: range = team_size*team_range
if (test_type == 300) {
Kokkos::parallel_for("300 outer for", team_size*team_range,
KOKKOS_LAMBDA (const int idx) {
v1(idx) = idx;
// prevent compiler from optimizing away the loop
});
}
// parallel_reduce RangePolicy: range = team_size*team_range
if (test_type == 400) {
Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
KOKKOS_LAMBDA (const int idx, double& val) {
val += idx;
}, result);
result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
}
// parallel_scan RangePolicy: range = team_size*team_range
if (test_type == 500) {
Kokkos::parallel_scan("500 outer scan", team_size*team_range,
ParallelScanFunctor<ViewType1>(v1)
#if 0
// This does not compile with pre-CUDA-8.0 compilers; see GitHub Issue #913 for an explanation
KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
// inclusive scan
val += v1(idx);
if ( final ) {
v1(idx) = val;
}
}
#endif
);
// result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
// result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
}
} // end outer for loop
time = timer.seconds();
} //end test_policy

View File

@ -0,0 +1,53 @@
#!/bin/bash
# Script to check policy_perf_test code works with each possible combo of options
echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"
EXECUTABLE=policy_performance
TEAMRANGE=1000
THREADRANGE=4
VECTORRANGE=32
TEAMSIZE=4
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1
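# Positional argument order expected by the executable:
# team_range thread_range vector_range outer_repeat thread_repeat vector_repeat team_size vector_size schedule test_type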
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]
then
SCHEDULE=1
echo "Host tests Static schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
SCHEDULE=2
echo "Host tests Dynamic schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
fi
SUFFIX=cuda
if [ -e $EXECUTABLE.$SUFFIX ]
then
SCHEDULE=1
echo "Cuda tests Static schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
SCHEDULE=2
echo "Cuda tests Dynamic schedule"
for CODE in {100,110,111,112,120,121,122,200,210,211,212,220,221,222,300,400,500}
do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
fi

View File

@ -0,0 +1,126 @@
#!/bin/bash
# Sample script for benchmarking policy performance
# Suggested environment variables to export prior to executing script:
# KNL:
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
# Power:
# OMP_NUM_THREADS=64 OMP_PROC_BIND=true
# Constants and Variables:
# Vary: TEAMSIZE, and THREADRANGE
# for TEAMSIZE in {1,2,4,5,8}; do
# for THREADRANGE in {32,41,1000}; do
# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
# System specific: Adjust REPEAT values to architecture tests are run on
# Tests
# Static SCHEDULE = 1
# Tier 1: parallel_for + RangePolicy 300
# Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
# Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
# Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Dynamic SCHEDULE = 2
# Tier 5: parallel_for + RangePolicy 300
# Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
# Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
# Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Results grouped by:
# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE
EXECUTABLE=policy_performance
# Default defined values
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1
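# OREPEAT/MREPEAT/IREPEAT are passed as the outer_repeat/thread_repeat/vector_repeat arguments
# of the executable; adjust them to the machine so each timed region runs long enough.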
# Host tests
SUFFIX=host
if [ -e $EXECUTABLE.$SUFFIX ]; then
echo "Host"
for SCHEDULE in {1,2}; do
# Tier 1 and 2, 5 and 6
for CODE in {300,400,500}; do
for TEAMSIZE in {1,2,4,5,8}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
# Tier 3, 7
for CODE in {100,110,111,112,120,121,122}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
# Tier 4, 8
for CODE in {200,210,211,212,220,221,222}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
done # end SCHEDULE
fi # end host
# Cuda tests
SUFFIX=cuda
# TEAMRANGE=10000, TEAMSIZE=8 too large
# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
if [ -e $EXECUTABLE.$SUFFIX ]; then
echo "Cuda"
for SCHEDULE in {1,2}; do
# Reset defaults
TEAMRANGE=1000
THREADRANGE=1
VECTORRANGE=32
TEAMSIZE=1
VECTORSIZE=1
# Tier 1 and 2, 5 and 6
for CODE in {300,400,500}; do
for TEAMSIZE in {1,2,4,5,8}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
# Tier 3, 7
for CODE in {100,110,111,112,120,121,122}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
# Tier 4, 8
for CODE in {200,210,211,212,220,221,222}; do
for TEAMSIZE in {1,2,4,5,8}; do
for THREADRANGE in {32,41,1000}; do
./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
done
done
done
done # end SCHEDULE
fi #end cuda

454
lib/kokkos/bin/hpcbind Executable file
View File

@ -0,0 +1,454 @@
#!/usr/bin/env bash
################################################################################
# Check if hwloc commands exist
################################################################################
declare -i HPCBIND_HAS_HWLOC=1
type hwloc-bind >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-distrib >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-ls >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-calc >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
type hwloc-ps >/dev/null 2>&1
HPCBIND_HAS_HWLOC=$((HPCBIND_HAS_HWLOC & ! $?))
if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then
echo "hwloc not found, no process binding will occur"
fi
# Get parent cpuset
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
MY_PID="$BASHPID"
HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
fi
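# Example: when launched under mpiexec or a batch system that has already bound this process,
# hwloc-ps reports the script's PID together with its inherited cpuset (e.g. "0x000000f0"), and
# the hwloc-calc/hwloc-distrib calls below are restricted to that set, preserving the binding.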
################################################################################
# Check if nvidia-smi exist
################################################################################
declare -i HPCBIND_HAS_NVIDIA=0
type nvidia-smi >/dev/null 2>&1
HPCBIND_HAS_NVIDIA=$((!$?))
################################################################################
# Get visible gpu
################################################################################
declare -i NUM_GPUS=0
HPCBIND_VISIBLE_GPUS=""
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
NUM_GPUS=$(nvidia-smi -L | wc -l);
GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
fi
declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
################################################################################
# Get queue id
# supports sbatch, bsub, aprun
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="sbatch"
HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="bsub"
HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="aprun"
HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
################################################################################
# Show help
################################################################################
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Set the process mask, OMP environment variables and CUDA environment"
echo " variables to sane values if possible. Uses hwloc and nvidia-smi if"
echo " available. Will preserve the current process binding, so it is safe"
echo " to use with a queuing system or mpiexec."
echo ""
echo "Options:"
echo " --no-hwloc-bind Disable binding"
echo " --proc-bind=<LOC> Set the initial process mask for the script"
echo " LOC can be any valid location argument for"
echo " hwloc-calc Default: all"
echo " --distribute=N Distribute the current cpuset into N partitions"
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
echo " --openmp-percent=N Integer percentage of cpuset to use for OpenMP"
echo " threads Default: 100"
echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --force-openmp-num-threads=N"
echo " Override logic for selecting OMP_NUM_THREADS"
echo " --force-openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --show-bindings Show the bindings"
echo " --lstopo Show bindings in lstopo without executing a command"
echo " -v|--verbose Show options and relevant environment variables"
echo " -h|--help Show this message"
echo ""
echo "Sample Usage:"
echo " Split the current process cpuset into 4 and use the 3rd partition"
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
echo " Bind the process to all even cores"
echo " ${cmd} --proc-bind=core:even -v -- command ..."
echo " Bind to the first 64 cores and split the current process cpuset into 4"
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
echo " Skip GPU 0 when mapping visible devices"
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings"
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
echo " Display the current bindings using lstopo"
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
echo ""
}
################################################################################
# Parse command line arguments
################################################################################
# Show help if no command line arguments given
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0
for i in $@; do
case $i in
# number of partitions to create
--no-hwloc-bind)
HPCBIND_ENABLE_HWLOC_BIND=0
shift
;;
--proc-bind=*)
HPCBIND_PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
HPCBIND_DISTRIBUTE="${i#*=}"
shift
;;
# which partition to use
--distribute-partition=*)
HPCBIND_PARTITION="${i#*=}"
shift
;;
--visible-gpus=*)
HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
shift
;;
--gpu-ignore-queue)
HPCBIND_QUEUE_GPU_MAPPING=0
shift
;;
--no-gpu-mapping)
HPCBIND_ENABLE_GPU_MAPPING=0
shift
;;
--openmp=*)
HPCBIND_OPENMP_VERSION="${i#*=}"
shift
;;
--openmp-percent=*)
HPCBIND_OPENMP_PERCENT="${i#*=}"
shift
;;
--openmp-places=*)
HPCBIND_OPENMP_PLACES="${i#*=}"
shift
;;
--no-openmp-proc-bind)
HPCBIND_OPENMP_PROC_BIND=0
shift
;;
--force-openmp-proc-bind=*)
HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
shift
;;
--force-openmp-num-threads=*)
HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
shift
;;
--no-openmp-nested)
HPCBIND_OPENMP_NESTED="false"
shift
;;
--show-bindings)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=1
shift
;;
--lstopo)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=0
HPCBIND_LSTOPO=1
shift
;;
-v|--verbose)
HPCBIND_VERBOSE=1
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
################################################################################
# Check unknown arguments
################################################################################
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
echo "Unknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
################################################################################
# Check that visible gpus are valid
################################################################################
HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
HPCBIND_VISIBLE_GPUS[$i]=0;
fi
done
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi
################################################################################
# Check OpenMP percent
################################################################################
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
echo "OpenMP percent < 1, setting to 1"
HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
echo "OpenMP percent > 100, setting to 100"
HPCBIND_OPENMP_PERCENT=100
fi
################################################################################
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
HPCBIND_DISTRIBUTE=1
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
echo "Invalid input for distribute-partition, changing to 0"
HPCBIND_PARTITION=0
fi
################################################################################
# Find cpuset and num threads
################################################################################
HPCBIND_HWLOC_CPUSET=""
declare -i HPCBIND_NUM_PUS=0
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
else
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
fi
declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT))
HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100))
if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then
HPCBIND_OPENMP_NUM_THREADS=1
elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
fi
if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then
HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
fi
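# Worked example: with 16 PUs in the chosen cpuset and --openmp-percent=75,
# OMP_NUM_THREADS = (16 * 75) / 100 = 12 (integer arithmetic), clamped to [1, HPCBIND_NUM_PUS]
# unless --force-openmp-num-threads overrides it.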
################################################################################
# Set OpenMP environment variables
################################################################################
# set OMP_NUM_THREADS
export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}
# set OMP_PROC_BIND and OMP_PLACES
if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_OPENMP_FORCE_PROC_BIND}" == "" ]]; then
#default proc bind logic
if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
#force proc bind
export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
fi
else
# no openmp proc bind
unset OMP_PLACES
unset OMP_PROC_BIND
fi
# set OMP_NESTED
export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
################################################################################
# Set CUDA environment variables
################################################################################
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
else
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
fi
fi
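# Example GPU mapping: with 4 visible GPUs, --distribute=2, --distribute-partition=1 and a
# queue-provided local index of 3, MY_TASK_ID = 3*2 + 1 = 7, so CUDA_VISIBLE_DEVICES is set to
# visible GPU entry 7 % 4 = 3.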
################################################################################
# Set hpcbind environment variables
################################################################################
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
export HPCBIND_HWLOC_PARENT_CPUSET="all"
else
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
fi
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
fi
################################################################################
# Print verbose
################################################################################
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
MY_ENV=$(env | sort)
echo "[HPCBIND]"
echo "${MY_ENV}" | grep -E "^HPCBIND_"
echo "[CUDA]"
echo "${MY_ENV}" | grep -E "^CUDA_"
echo "[OPENMP]"
echo "${MY_ENV}" | grep -E "^OMP_"
fi
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "Unable to show bindings, hwloc not available."
fi
################################################################################
# Run command
################################################################################
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
else
eval $@
fi
else
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
else
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
fi
else
echo "Unable to show bindings, hwloc not available."
fi
fi

221
lib/kokkos/bin/kokkos-bind Executable file
View File

@ -0,0 +1,221 @@
#!/usr/bin/env bash
# check if hwloc commands exist
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-distrib >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ls >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-calc >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ps >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
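# HAS_HWLOC accumulates the exit codes of the five "type" checks above (declare -i makes the
# string an arithmetic sum), so it is 0 only when every hwloc tool was found; the "-eq 0"
# tests below rely on this.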
#parse args
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False
#get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
MY_PID="$BASHPID"
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
echo "$CURRENT_CPUSET"
fi
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Uses hwloc to divide the node into the given number of groups,"
echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
echo " selected group."
echo ""
echo " NOTE: This command assumes it has exclusive use of the node"
echo ""
echo "Options:"
echo " --proc-bind=<LOC> Set the initial process mask for the script. "
echo " LOC can be any valid location argument for"
echo " hwloc-calc. Defaults to the entire machine"
echo " --distribute=N Distribute the current proc-bind into N groups"
echo " --index=I Use the i'th group (zero based)"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " (default 4.0)"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " -v|--verbose"
echo " -h|--help"
echo ""
echo "Sample Usage:"
echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
echo ""
}
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
for i in $@; do
case $i in
# number of partitions to create
--proc-bind=*)
PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
DISTRIBUTE="${i#*=}"
shift
;;
# which group to use
--index=*)
INDEX="${i#*=}"
shift
;;
--openmp=*)
OPENMP_VERSION="${i#*=}"
shift
;;
--no-openmp-proc-bind)
OPENMP_PROC_BIND=False
shift
;;
--no-openmp-nested)
OPENMP_NESTED=False
shift
;;
-v|--verbose)
VERBOSE=True
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
if [[ ${#UNKNOWN_ARGS[*]} -gt 0 ]]; then
echo "Unknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
if [[ ${DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
DISTRIBUTE=1
fi
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
echo "Invalid input for index, changing index to 0"
INDEX=0
fi
if [[ ${HAS_HWLOC} -ne 0 ]]; then
echo "hwloc not found, no process binding will occur"
DISTRIBUTE=1
INDEX=0
fi
if [[ ${HAS_HWLOC} -eq 0 ]]; then
if [[ "${CURRENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
CPUSET=${CPUSETS[${INDEX}]}
NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: true"
echo " proc_bind: ${PROC_BIND}"
echo " distribute: ${DISTRIBUTE}"
echo " index: ${INDEX}"
echo " parent_cpuset: ${CURRENT_CPUSET}"
echo " cpuset: ${CPUSET}"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
hwloc-bind ${CPUSET} -- $@
else
NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: false"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
eval $@
fi

165
lib/kokkos/bin/runtest Executable file
View File

@ -0,0 +1,165 @@
#!/usr/bin/env bash
function get_path() {
cd "$(dirname "$0")"
cd ..
echo "$(pwd -P)"
}
KOKKOS_PATH="$(get_path "$0")"
function show_help() {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> "
echo " Build and run the tests"
echo ""
echo "Options:"
echo " -j=N|--make-j=N Build the tests in parallel"
echo " -c|--clean Clean build and regenerate make files"
echo " --clean-on-pass Clean build when runtest passes"
echo " --output-prefix=<pre> Prefix of log files Default: runtest"
echo " --build-only Only build the tests"
echo " -v|--verbose Tee STDOUT and STDERR to screen and files"
echo " -h|--help Show this message"
echo ""
${KOKKOS_PATH}/generate_makefile.bash --help
return 0
}
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}
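# MAKE_J defaults to HPCBIND_NUM_PUS when the script is launched through hpcbind (which exports
# that variable) and to 1 otherwise.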
for i in $@; do
case $i in
-j=*|--make-j=*)
MAKE_J=${i#*=}
shift
;;
-c|--clean)
CLEAN=1
shift
;;
--clean-on-pass)
CLEAN_ON_PASS=1
shift
;;
--output-prefix=*)
OUTPUT=${i#*=}
shift
;;
--build-only)
BUILD_ONLY=1
shift
;;
-v|--verbose)
VERBOSE=1
shift
;;
-h|--help)
show_help
exit 0
;;
*)
GENERATE_ARGS+=("$i")
shift
;;
esac
done
if [[ "$(pwd -P)" == ${KOKKOS_PATH} ]]; then
echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
exit 1
fi
# Some makefile dependencies are incorrect, so clean needs to force
# a new call to generate_makefiles.bash
if [[ ${CLEAN} -eq 1 ]]; then
START=${SECONDS}
echo "Cleaning"
/bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
END=${SECONDS}
echo " $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
fi
declare -i START=${SECONDS}
echo "Generating Makefile"
echo " ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"
if [[ ${VERBOSE} -eq 0 ]]; then
"${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
else
"${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
fi
declare -i RESULT=$?
declare -i END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
else
cat ${OUTPUT}.out | grep "FAIL"
cat ${OUTPUT}.err | grep "FAIL"
echo " FAIL: $((END-START)) seconds"
exit 1
fi
START=${SECONDS}
echo "Building"
if [[ ${VERBOSE} -eq 0 ]]; then
make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${VERBOSE} -eq 1 ]]; then
echo ""
echo ""
fi
else
cat ${OUTPUT}.out | grep -E "[[:space:]]error:[[:space:]]"
cat ${OUTPUT}.err | grep -E "[[:space:]]error:[[:space:]]"
echo " FAIL: $((END-START)) seconds"
exit 1
fi
if [[ ${BUILD_ONLY} -eq 0 ]]; then
START=${SECONDS}
echo "Testing"
if [[ ${VERBOSE} -eq 0 ]]; then
make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
RESULT=$?
END=${SECONDS}
if [[ ${RESULT} -eq 0 ]]; then
echo " PASS: $((END-START)) seconds"
if [[ ${CLEAN_ON_PASS} -eq 1 ]]; then
make clean
fi
else
cat ${OUTPUT}.out | grep "FAIL"
cat ${OUTPUT}.err | grep "FAIL"
echo " FAIL: $((END-START)) seconds"
exit 1
fi
fi
exit ${RESULT}

View File

@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
${Kokkos_SOURCE_DIR}/containers/src
${Kokkos_SOURCE_DIR}/algorithms/src
${Kokkos_BINARY_DIR} # to find KokkosCore_config.h
${KOKKOS_INCLUDE_DIRS}
)
# pass include dirs back to parent scope
SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
IF(KOKKOS_SEPARATE_LIBS)

View File

@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a

View File

@ -0,0 +1,24 @@
#include <cstdio>
#include <cuda_runtime_api.h>
int main()
{
cudaDeviceProp prop;
const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
if (cudaSuccess != err_code) {
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
return -1;
}
switch (prop.major) {
case 3:
printf("Kepler"); break;
case 5:
printf("Maxwell"); break;
case 6:
printf("Pascal"); break;
default:
fprintf(stderr, "Unsupported Device %d%d\n", (int)prop.major, (int)prop.minor);
return -1;
}
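// The final output is the architecture name immediately followed by the numeric compute
// capability, e.g. "Pascal60" for a compute capability 6.0 device.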
printf("%d%d\n", (int)prop.major, (int)prop.minor);
return 0;
}

View File

@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
@ -584,7 +589,7 @@ single_build_and_test() {
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"

View File

@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
export JENKINS_DO_SERIAL=OFF
export JENKINS_DO_COMPLEX=OFF
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
export JENKINS_DO_TESTS=ON
export JENKINS_DO_EXAMPLES=ON
export JENKINS_DO_SHARED=OFF
export JENKINS_DO_SHARED=ON
export QUEUE=haswell

View File

@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
export JENKINS_DO_SERIAL=ON
export JENKINS_DO_COMPLEX=ON
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
export JENKINS_DO_TESTS=ON
export JENKINS_DO_EXAMPLES=ON
export JENKINS_DO_SHARED=OFF
export JENKINS_DO_SHARED=ON
export QUEUE=haswell

View File

@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
test-openmp: KokkosContainers_PerformanceTest_OpenMP
./KokkosContainers_PerformanceTest_OpenMP
build_all: $(TARGETS)
test: $(TEST_TARGETS)

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,12 +36,15 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Macros.hpp>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);

View File

@ -69,30 +69,13 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned num_threads = 4;
if (Kokkos::hwloc::available()) {
num_threads = Kokkos::hwloc::get_available_numa_count()
* Kokkos::hwloc::get_available_cores_per_numa()
* Kokkos::hwloc::get_available_threads_per_core()
;
}
std::cout << "OpenMP: " << num_threads << std::endl;
Kokkos::OpenMP::initialize( num_threads );
std::cout << "available threads: " << omp_get_max_threads() << std::endl;
Kokkos::OpenMP::initialize();
Kokkos::OpenMP::print_configuration( std::cout );
}
static void TearDownTestCase()
{
Kokkos::OpenMP::finalize();
omp_set_num_threads(1);
ASSERT_EQ( 1 , omp_get_max_threads() );
}
};

View File

@ -564,7 +564,7 @@ namespace Impl {
template< class D, class A1, class A2, class A3, class ... Args >
struct DualViewSubview {
typedef typename Kokkos::Experimental::Impl::ViewMapping
typedef typename Kokkos::Impl::ViewMapping
< void
, Kokkos::ViewTraits< D, A1, A2, A3 >
, Args ...

View File

@ -46,19 +46,6 @@
///
/// This header file declares and defines Kokkos::Experimental::DynRankView and its
/// related nonmember functions.
/*
* Changes from View
* 1. The rank of the DynRankView is returned by the method rank()
* 2. Max rank of a DynRankView is 7
* 3. subview name is subdynrankview
* 4. Every subdynrankview is returned with LayoutStride
*
* NEW: Redesigned DynRankView
* 5. subview function name now available
* 6. Copy and Copy-Assign View to DynRankView
* 7. deep_copy between Views and DynRankViews
* 8. rank( view ); returns the rank of View or DynRankView
*/
#ifndef KOKKOS_DYNRANKVIEW_HPP
#define KOKKOS_DYNRANKVIEW_HPP
@ -117,6 +104,14 @@ struct DynRankDimTraits {
, layout.dimension[7] );
}
// Extra overload to match that for specialize types v2
template <typename Layout, typename ... P>
KOKKOS_INLINE_FUNCTION
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
{
return computeRank(layout);
}
// Create the layout for the rank-7 view.
// Non-strided Layout
template <typename Layout>
@ -158,8 +153,17 @@ struct DynRankDimTraits {
);
}
// Extra overload to match that for specialize types
template <typename Traits, typename ... P>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
{
return createLayout( layout );
}
// Create a view from the given dimension arguments.
// This is only necessary because the shmem constructor doesn't take a layout.
// NDE shmem View's are not compatible with the added view_alloc value_type / fad_dim deduction functionality
template <typename ViewType, typename ViewArg>
static ViewType createView( const ViewArg& arg
, const size_t N0
@ -186,7 +190,8 @@ struct DynRankDimTraits {
// Non-strided Layout
template <typename Layout , typename iType>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
reconstructLayout( const Layout& layout , iType dynrank )
{
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
, dynrank > 1 ? layout.dimension[1] : ~size_t(0)
@ -202,7 +207,8 @@ struct DynRankDimTraits {
// LayoutStride
template <typename Layout , typename iType>
KOKKOS_INLINE_FUNCTION
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
reconstructLayout( const Layout& layout , iType dynrank )
{
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
, dynrank > 0 ? layout.stride[0] : (0)
@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
/** \brief Assign compatible default mappings */
struct ViewToDynRankViewTag {};
} // namespace Impl
} // namespace Experimental
namespace Impl {
template< class DstTraits , class SrcTraits >
class ViewMapping< DstTraits , SrcTraits ,
typename std::enable_if<(
@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
)
)
)
) , ViewToDynRankViewTag >::type >
) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
{
private:
@ -376,7 +387,7 @@ public:
typedef typename DstType::offset_type dst_offset_type ;
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_track.assign( src.m_track , DstTraits::is_managed );
dst.m_rank = src.Rank ;
}
@ -384,22 +395,20 @@ public:
} //end Impl
namespace Experimental {
/* \class DynRankView
* \brief Container that creates a Kokkos view with rank determined at runtime.
* Essentially this is a rank 7 view that wraps the access operators
* to yield the functionality of a view
* Essentially this is a rank 7 view
*
* Changes from View
* 1. The rank of the DynRankView is returned by the method rank()
* 2. Max rank of a DynRankView is 7
* 3. subview name is subdynrankview
* 4. Every subdynrankview is returned with LayoutStride
*
* NEW: Redesigned DynRankView
* 5. subview function name now available
* 6. Copy and Copy-Assign View to DynRankView
* 7. deep_copy between Views and DynRankViews
* 8. rank( view ); returns the rank of View or DynRankView
* 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility)
* 4. Every subview is returned with LayoutStride
* 5. Copy and Copy-Assign View to DynRankView
* 6. deep_copy between Views and DynRankViews
* 7. rank( view ); returns the rank of View or DynRankView
*
*/
@ -427,7 +436,7 @@ public:
private:
typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
track_type m_track ;
@ -556,7 +565,7 @@ public:
// Allow specializations to query their specialized map
KOKKOS_INLINE_FUNCTION
const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
const Kokkos::Impl::ViewMapping< traits , void > &
implementation_map() const { return m_map ; }
//----------------------------------------
@ -803,7 +812,7 @@ public:
, m_rank(rhs.m_rank)
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
}
@ -813,7 +822,7 @@ public:
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
{
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
m_track.assign( rhs.m_track , traits::is_managed );
@ -831,7 +840,7 @@ public:
, m_rank( rhs.Rank )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
Mapping::assign( *this , rhs );
}
@ -841,7 +850,7 @@ public:
DynRankView & operator = ( const View<RT,RP...> & rhs )
{
typedef typename View<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
Mapping::assign( *this , rhs );
return *this ;
@ -870,7 +879,7 @@ public:
)
: m_track()
, m_map()
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
{
// Append layout and spaces if not input
typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
@ -923,7 +932,7 @@ public:
//------------------------------------------------------------
Kokkos::Experimental::Impl::SharedAllocationRecord<> *
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
//------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA )
@ -947,8 +956,8 @@ public:
>::type const & arg_layout
)
: m_track() // No memory tracking
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
{
static_assert(
std::is_same< pointer_type
@ -1034,6 +1043,7 @@ public:
{}
// For backward compatibility
// NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
explicit inline
DynRankView( const ViewAllocateWithoutInitializing & arg_prop
, const typename traits::array_layout & arg_layout
@ -1179,6 +1189,11 @@ namespace Impl {
struct DynRankSubviewTag {};
} // namespace Impl
} // namespace Experimental
namespace Impl {
template< class SrcTraits , class ... Args >
struct ViewMapping
< typename std::enable_if<(
@ -1192,7 +1207,7 @@ struct ViewMapping
std::is_same< typename SrcTraits::array_layout
, Kokkos::LayoutStride >::value
)
), DynRankSubviewTag >::type
), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
, SrcTraits
, Args ... >
{
@ -1264,7 +1279,7 @@ public:
};
typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
template < typename T , class ... P >
KOKKOS_INLINE_FUNCTION
@ -1336,9 +1351,10 @@ public:
} // end Impl
namespace Experimental {
template< class V , class ... Args >
using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
template< class D , class ... P , class ...Args >
KOKKOS_INLINE_FUNCTION
@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
return metafcn::subview( src.rank() , src , args... );
}
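A hedged usage sketch for the function above (illustrative, not from this diff): at least src.rank() slicing arguments must be supplied or the Kokkos::abort fires, extra trailing arguments are ignored per the comment, and every result uses LayoutStride. It assumes Kokkos has already been initialized elsewhere; the function and variable names are placeholders.

#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <type_traits>

void subdynrankview_demo() {
  Kokkos::DynRankView<double> a("a", 5, 4);   // runtime rank 2

  // Exactly rank arguments: a rank-1 strided slice of row 2.
  auto row = Kokkos::Experimental::subdynrankview(a, 2, Kokkos::ALL());

  // More arguments than the runtime rank: the trailing ones are ignored.
  auto col = Kokkos::Experimental::subdynrankview(a, Kokkos::ALL(), 1, 0, 0);

  // Fewer arguments than the runtime rank would trigger the abort shown above.
  static_assert(std::is_same<decltype(row)::array_layout, Kokkos::LayoutStride>::value,
                "sub-DynRankViews are returned with LayoutStride");
  (void)col;
}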

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -57,7 +57,7 @@ namespace Experimental {
*/
template< typename DataType , typename ... P >
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
{
{
public:
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
@ -68,7 +68,7 @@ private:
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
, "DynamicView must be rank-one" );
static_assert( std::is_trivial< typename traits::value_type >::value &&
@ -216,14 +216,14 @@ public:
// Verify that allocation of the requested chunk is in progress.
// The allocated chunk counter is m_chunks[ m_chunk_max ]
const uintptr_t n =
const uintptr_t n =
*reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
if ( n <= ic ) {
Kokkos::abort("Kokkos::DynamicView array bounds error");
}
// Allocation of this chunk is in progress
// Allocation of this chunk is in progress
// so wait for allocation to complete.
while ( 0 == *ch );
}
@ -267,7 +267,7 @@ public:
const uintptr_t jc_try = jc ;
// Jump iteration to the chunk counter.
jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
if ( jc_try == jc ) {
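The compare-exchange in the hunk above is a claim-by-CAS pattern: a thread snapshots the shared chunk counter, attempts to bump it, and only the thread whose snapshot comes back from the exchange owns that slot and performs the allocation; everyone else retries with the value it observed. A minimal standalone sketch of the pattern (illustrative; the helper name is hypothetical, only Kokkos::atomic_compare_exchange is assumed):

#include <Kokkos_Core.hpp>
#include <cstdint>

// Returns true if the calling thread won slot 'snapshot'. On failure, 'snapshot'
// is updated to the value observed at the exchange so the caller can retry.
KOKKOS_INLINE_FUNCTION
bool try_claim_slot(uintptr_t* counter, uintptr_t& snapshot) {
  const uintptr_t expected = snapshot;
  // Kokkos::atomic_compare_exchange returns the value seen before the exchange.
  snapshot = Kokkos::atomic_compare_exchange(counter, expected, expected + 1);
  return snapshot == expected;   // equal: this thread claimed slot 'expected'
}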
@ -316,7 +316,7 @@ public:
}
else {
while ( NC + 1 <= *pc ) {
--*pc ;
--*pc ;
m_pool.deallocate( m_chunks[*pc]
, sizeof(value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
@ -331,7 +331,7 @@ public:
typename traits::value_type ** m_chunks ;
uintptr_t * m_pc ;
uintptr_t m_nc ;
unsigned m_chunk_shift ;
unsigned m_chunk_shift ;
KOKKOS_INLINE_FUNCTION
void operator()( int ) const
@ -348,7 +348,7 @@ public:
}
else {
while ( m_nc + 1 <= *m_pc ) {
--*m_pc ;
--*m_pc ;
m_pool.deallocate( m_chunks[*m_pc]
, sizeof(value_type) << m_chunk_shift );
m_chunks[*m_pc] = 0 ;
@ -482,7 +482,7 @@ public:
};
/**\brief Allocation constructor
/**\brief Allocation constructor
*
* Memory is allocated in chunks from the memory pool.
* The chunk size conforms to the memory pool's chunk size.
@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces when the views are non-contiguous or have incompatible shapes.
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
}
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
if ( DstExecCanAccessSrc ) {
// Copying data between views in accessible memory spaces when the views are non-contiguous or have incompatible shapes.
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
}
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");

View File

@ -69,6 +69,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
//----------------------------------------------------------------------------
@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Cuda >();
}
TEST_F( cuda, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
}
TEST_F( cuda , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();

View File

@ -66,6 +66,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
#include <iomanip>
namespace Test {
@ -76,14 +78,7 @@ protected:
{
std::cout << std::setprecision(5) << std::scientific;
unsigned threads_count = 4 ;
if ( Kokkos::hwloc::available() ) {
threads_count = Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa();
}
Kokkos::OpenMP::initialize( threads_count );
Kokkos::OpenMP::initialize();
}
static void TearDownTestCase()
@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
TestDynViewAPI< double , Kokkos::OpenMP >();
}
TEST_F( openmp, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
}
TEST_F( openmp, bitset )
{
test_bitset<Kokkos::OpenMP>();

View File

@ -67,6 +67,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
namespace Test {
class serial : public ::testing::Test {
@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
TestDynViewAPI< double , Kokkos::Serial >();
}
TEST_F( serial, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
}
TEST_F( serial , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();

View File

@ -70,6 +70,8 @@
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
namespace Test {
class threads : public ::testing::Test {
@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Threads >();
}
TEST_F( threads, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
}
TEST_F( threads , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();

View File

@ -0,0 +1,213 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cstdio>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <type_traits>
#include <typeinfo>
namespace Test {
namespace {
template <typename ExecSpace >
struct TestViewCtorProp_EmbeddedDim {
using ViewIntType = typename Kokkos::View< int**, ExecSpace >;
using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >;
using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >;
// Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
template < class ViewType >
struct Functor {
ViewType v;
Functor( const ViewType & v_ ) : v(v_) {}
KOKKOS_INLINE_FUNCTION
void operator()( const int i ) const {
v(i) = i;
}
};
static void test_vcpt( const int N0, const int N1 )
{
// Create two views to test
{
using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
VIT vi1("vi1", N0, N1);
VDT vd1("vd1", N0);
// TEST: Test for common type between two views, one with type double, other with type int
// Deduce common value_type and construct a view with that type
{
// Two views
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
#if 0
// debug output
for ( int i = 0; i < N0*N1; ++i ) {
printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
}
printf( " Common value type view: %s \n", typeid( CVT() ).name() );
printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
if ( std::is_same< CommonViewValueType, double >::value == true ) {
printf("Proper common value_type\n");
}
else {
printf("WRONG common value_type\n");
}
// end debug output
#endif
}
{
// Single view
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
}
}
// Create two dynamic rank views to test
{
using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
VIT vi1("vi1", N0, N1);
VDT vd1("vd1", N0);
// TEST: Test for common type between two views, one with type double, other with type int
// Deduce common value_type and construct a view with that type
{
// Two views
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
}
{
// Single views
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
typedef typename CVT::HostMirror HostCVT;
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
Functor<CVT>(cv1)
);
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
Kokkos::deep_copy( hcv1, cv1 );
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
}
}
} // end test_vcpt
}; // end struct
} // namespace
} // namespace Test

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,12 +36,14 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <cstdlib>
#include <Kokkos_Macros.hpp>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);

Some files were not shown because too many files have changed in this diff.