Merge branch 'master' into kk_copycpu

2020-04-28 11:56:22 -04:00 · 2020-04-28 11:56:22 -04:00 · 21cb0d394f
parent 700bb2e0f4 2e07345cea
commit 21cb0d394f
679 changed files with 26945 additions and 15386 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -45,7 +45,7 @@ src/GPU/pair_vashishta_gpu.*        @andeplane
 src/KOKKOS/pair_vashishta_kokkos.*  @andeplane
 src/MANYBODY/pair_vashishta_table.* @andeplane
 src/MANYBODY/pair_atm.*             @sergeylishchuk
-src/USER-MISC/fix_bond_react.*      @jrgissing
+src/USER-REACTION/fix_bond_react.*  @jrgissing
 src/USER-MISC/*_grem.*              @dstelter92
 src/USER-MISC/compute_stress_mop*.* @RomainVermorel

--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -135,23 +135,20 @@ if(PKG_USER-ADIOS)
  target_link_libraries(lammps PRIVATE adios2::adios2)
 endif()

-if (CMAKE_SYSTEM_NAME STREQUAL Windows)
-  option(BUILD_MPI "Build MPI version" OFF)
-else()
-  # do MPI detection after language activation,
-  # in case MPI for these languages is required
+if(NOT CMAKE_CROSSCOMPILING)
  set(MPI_CXX_SKIP_MPICXX TRUE)
  find_package(MPI QUIET)
  option(BUILD_MPI "Build MPI version" ${MPI_FOUND})
+else()
+  option(BUILD_MPI "Build MPI version" OFF)
 endif()

 if(BUILD_MPI)
-  # We use a non-standard procedure to compile with MPI on windows
-  if (CMAKE_SYSTEM_NAME STREQUAL Windows)
+  # We use a non-standard procedure to cross-compile with MPI on Windows
+  if((CMAKE_SYSTEM_NAME STREQUAL Windows) AND CMAKE_CROSSCOMPILING)
    include(MPI4WIN)
    target_link_libraries(lammps PUBLIC MPI::MPI_CXX)
  else()
-    set(MPI_CXX_SKIP_MPICXX ON)
    find_package(MPI REQUIRED)
    target_link_libraries(lammps PUBLIC MPI::MPI_CXX)
    option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
@ -332,8 +329,9 @@ set(CMAKE_TUNE_FLAGS "${CMAKE_TUNE_DEFAULT}" CACHE STRING "Compiler specific opt
 separate_arguments(CMAKE_TUNE_FLAGS)
 include(CheckCXXCompilerFlag)
 foreach(_FLAG ${CMAKE_TUNE_FLAGS})
-  check_cxx_compiler_flag("${_FLAG}" COMPILER_SUPPORTS${_FLAG})
-  if(COMPILER_SUPPORTS${_FLAG})
+  string(REGEX REPLACE "[=\"]" "" _FLAGX ${_FLAG})
+  check_cxx_compiler_flag("${_FLAG}" COMPILER_SUPPORTS${_FLAGX})
+  if(COMPILER_SUPPORTS${_FLAGX})
    target_compile_options(lammps PRIVATE ${_FLAG})
  else()
    message(WARNING "${_FLAG} found in CMAKE_TUNE_FLAGS, but not supported by the compiler, skipping")
@ -705,6 +703,7 @@ else()
 endif()
 if(BUILD_MPI)
  message(STATUS "<<< MPI flags >>>
+-- MPI_defines:      ${MPI_CXX_COMPILE_DEFINITIONS}
 -- MPI includes:     ${MPI_CXX_INCLUDE_PATH}
 -- MPI libraries:    ${MPI_CXX_LIBRARIES};${MPI_Fortran_LIBRARIES}")
 endif()
--- a/cmake/Modules/MPI4WIN.cmake
+++ b/cmake/Modules/MPI4WIN.cmake
@ -23,3 +23,8 @@ set_target_properties(MPI::MPI_CXX PROPERTIES
  INTERFACE_INCLUDE_DIRECTORIES "${SOURCE_DIR}/include"
  INTERFACE_COMPILE_DEFINITIONS "MPICH_SKIP_MPICXX")
 add_dependencies(MPI::MPI_CXX mpi4win_build)
+
+# set variables for status reporting at the end of CMake run
+set(MPI_CXX_INCLUDE_PATH "${SOURCE_DIR}/include")
+set(MPI_CXX_COMPILE_DEFINITIONS "MPICH_SKIP_MPICXX")
+set(MPI_CXX_LIBRARIES "${SOURCE_DIR}/lib/libmpi.a")
--- a/cmake/Modules/Packages/GPU.cmake
+++ b/cmake/Modules/Packages/GPU.cmake
@ -330,7 +330,7 @@ elseif(GPU_API STREQUAL "HIP")

  if(HIP_PLATFORM STREQUAL "nvcc")
    target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)
-    target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/include)
+    target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/../include)
    target_include_directories(gpu PRIVATE ${CUDA_INCLUDE_DIRS})
    target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})

@ -338,6 +338,12 @@ elseif(GPU_API STREQUAL "HIP")
    target_include_directories(hip_get_devices PRIVATE ${HIP_ROOT_DIR}/include)
    target_include_directories(hip_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
    target_link_libraries(hip_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
+  elseif(HIP_PLATFORM STREQUAL "hcc")
+    target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_HCC__)
+    target_include_directories(gpu PRIVATE ${HIP_ROOT_DIR}/../include)
+
+    target_compile_definitions(hip_get_devices PRIVATE -D__HIP_PLATFORM_HCC__)
+    target_include_directories(hip_get_devices PRIVATE ${HIP_ROOT_DIR}/../include)
  endif()

  target_link_libraries(lammps PRIVATE gpu)
@ -353,7 +359,12 @@ RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)

 get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)

-target_link_libraries(gpu PRIVATE MPI::MPI_CXX)
+if(NOT BUILD_MPI)
+  # mpi_stubs is aliased to MPI::MPI_CXX, but older versions of CMake won't forward the include path
+  target_link_libraries(gpu PRIVATE mpi_stubs)
+else()
+  target_link_libraries(gpu PRIVATE MPI::MPI_CXX)
+endif()
 if(NOT BUILD_SHARED_LIBS)
  install(TARGETS gpu EXPORT LAMMPS_Targets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
 endif()
--- a/cmake/Modules/Packages/KIM.cmake
+++ b/cmake/Modules/Packages/KIM.cmake
@ -1,7 +1,12 @@
 set(KIM-API_MIN_VERSION 2.1.3)
 find_package(CURL)
 if(CURL_FOUND)
-  target_link_libraries(lammps PRIVATE CURL::libcurl)
+  if(CMAKE_VERSION VERSION_LESS 3.12)
+    target_include_directories(lammps PRIVATE ${CURL_INCLUDE_DIRS})
+    target_link_libraries(lammps PRIVATE ${CURL_LIBRARIES})
+  else()
+    target_link_libraries(lammps PRIVATE CURL::libcurl)
+  endif()
  target_compile_definitions(lammps PRIVATE -DLMP_KIM_CURL)
  set(LMP_DEBUG_CURL OFF CACHE STRING "Set libcurl verbose mode on/off. If on, it displays a lot of verbose information about its operations.")
  mark_as_advanced(LMP_DEBUG_CURL)
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -14,16 +14,30 @@ endif()
 option(EXTERNAL_KOKKOS "Build against external kokkos library" OFF)
 option(DOWNLOAD_KOKKOS "Download the KOKKOS library instead of using the bundled one" OFF)
 if(DOWNLOAD_KOKKOS)
+  # extract Kokkos-related variables and values so we can forward them to the Kokkos library build
+  get_cmake_property(_VARS VARIABLES)
+  list(FILTER _VARS INCLUDE REGEX ^Kokkos_)
+  foreach(_VAR IN LISTS _VARS)
+    list(APPEND KOKKOS_LIB_BUILD_ARGS "-D${_VAR}=${${_VAR}}")
+  endforeach()
  message(STATUS "KOKKOS download requested - we will build our own")
-  file(DOWNLOAD https://github.com/kokkos/kokkos/compare/3.0.00...stanmoore1:lammps.diff ${CMAKE_CURRENT_BINARY_DIR}/kokkos-lammps.patch)
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>")
+  if(CMAKE_REQUEST_PIC)
+    list(APPEND KOKKOS_LIB_BUILD_ARGS ${CMAKE_REQUEST_PIC})
+  endif()
+  # append other CMake variables that need to be forwarded to CMAKE_ARGS
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_INSTALL_LIBDIR=lib")
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}")
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}")
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}")
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
+  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
  include(ExternalProject)
  ExternalProject_Add(kokkos_build
-    URL https://github.com/kokkos/kokkos/archive/3.0.00.tar.gz
-    URL_MD5 281c7093aa3a603276e93abdf4be23b9
-    PATCH_COMMAND patch -p1 < ${CMAKE_CURRENT_BINARY_DIR}/kokkos-lammps.patch
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${CMAKE_REQUEST_PIC}
-    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_LIBDIR=lib
-    -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}
+    URL https://github.com/kokkos/kokkos/archive/3.1.00.tar.gz
+    URL_MD5 f638a6c786f748a602b26faa0e96ebab
+    CMAKE_ARGS ${KOKKOS_LIB_BUILD_ARGS}
    BUILD_BYPRODUCTS <INSTALL_DIR>/lib/libkokkoscore.a
  )
  ExternalProject_get_property(kokkos_build INSTALL_DIR)
@ -39,9 +53,9 @@ if(DOWNLOAD_KOKKOS)
    install(CODE "MESSAGE(FATAL_ERROR \"Installing liblammps with downloaded libraries is currently not supported.\")")
  endif()
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 3)
+  find_package(Kokkos 3.1)
  if(NOT Kokkos_FOUND)
-    message(FATAL_ERROR "KOKKOS library not found, help CMake to find it by setting KOKKOS_LIBRARY, or set DOWNLOAD_KOKKOS=ON to download it")
+    message(FATAL_ERROR "KOKKOS library version 3.1 or later not found, help CMake to find it by setting KOKKOS_LIBRARY, or set DOWNLOAD_KOKKOS=ON to download it")
  endif()
  target_link_libraries(lammps PRIVATE Kokkos::kokkos)
 else()
--- a/cmake/Modules/Packages/LATTE.cmake
+++ b/cmake/Modules/Packages/LATTE.cmake
@ -1,4 +1,11 @@
 enable_language(Fortran)
+
+# using lammps in a super-build setting
+if(TARGET LATTE::latte)
+  target_link_libraries(lammps PRIVATE LATTE::latte)
+  return()
+endif()
+
 find_package(LATTE)
 if(LATTE_FOUND)
  set(DOWNLOAD_LATTE_DEFAULT OFF)
@ -35,5 +42,6 @@ else()
  if(NOT LATTE_FOUND)
    message(FATAL_ERROR "LATTE library not found, help CMake to find it by setting LATTE_LIBRARY, or set DOWNLOAD_LATTE=ON to download it")
  endif()
-  target_link_libraries(lammps PRIVATE LATTE::latte)
+  # latte needs lapack
+  target_link_libraries(lammps PRIVATE LATTE::latte ${LAPACK_LIBRARIES})
 endif()
--- a/cmake/Modules/Packages/USER-PLUMED.cmake
+++ b/cmake/Modules/Packages/USER-PLUMED.cmake
@ -74,12 +74,11 @@ if(DOWNLOAD_PLUMED)
    install(CODE "MESSAGE(FATAL_ERROR \"Installing liblammps with downloaded libraries is currently not supported.\")")
  endif()
  if(PLUMED_MODE STREQUAL "STATIC")
-    set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__PLUMED_WRAPPER_CXX=1")
    set_target_properties(LAMMPS::PLUMED PROPERTIES IMPORTED_LOCATION ${INSTALL_DIR}/lib/libplumed.a INTERFACE_LINK_LIBRARIES "${PLUMED_LINK_LIBS};${CMAKE_DL_LIBS}")
  elseif(PLUMED_MODE STREQUAL "SHARED")
    set_target_properties(LAMMPS::PLUMED PROPERTIES IMPORTED_LOCATION ${INSTALL_DIR}/lib/libplumed${CMAKE_SHARED_LIBRARY_SUFFIX} INTERFACE_LINK_LIBRARIES "${INSTALL_DIR}/lib/libplumedKernel${CMAKE_SHARED_LIBRARY_SUFFIX};${CMAKE_DL_LIBS}")
  elseif(PLUMED_MODE STREQUAL "RUNTIME")
-    set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__PLUMED_HAS_DLOPEN=1;__PLUMED_DEFAULT_KERNEL=${INSTALL_DIR}/lib/libplumedKernel${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__PLUMED_DEFAULT_KERNEL=${INSTALL_DIR}/lib/libplumedKernel${CMAKE_SHARED_LIBRARY_SUFFIX}")
    set_target_properties(LAMMPS::PLUMED PROPERTIES IMPORTED_LOCATION ${INSTALL_DIR}/lib/libplumedWrapper.a INTERFACE_LINK_LIBRARIES "${CMAKE_DL_LIBS}")
  endif()
  set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)
@ -89,12 +88,11 @@ else()
  pkg_check_modules(PLUMED REQUIRED plumed)
  add_library(LAMMPS::PLUMED INTERFACE IMPORTED)
  if(PLUMED_MODE STREQUAL "STATIC")
-    set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__PLUMED_WRAPPER_CXX=1")
    include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
  elseif(PLUMED_MODE STREQUAL "SHARED")
    include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.shared)
  elseif(PLUMED_MODE STREQUAL "RUNTIME")
-    set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__PLUMED_HAS_DLOPEN=1;__PLUMED_DEFAULT_KERNEL=${PLUMED_LIBDIR}/libplumedKernel${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_COMPILE_DEFINITIONS "__PLUMED_DEFAULT_KERNEL=${PLUMED_LIBDIR}/libplumedKernel${CMAKE_SHARED_LIBRARY_SUFFIX}")
    include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.runtime)
  endif()
  set_target_properties(LAMMPS::PLUMED PROPERTIES INTERFACE_LINK_LIBRARIES "${PLUMED_LOAD}")
--- a/cmake/presets/clang.cmake
+++ b/cmake/presets/clang.cmake
@ -2,7 +2,7 @@

 set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE)
 set(CMAKE_C_COMPILER "clang" CACHE STRING "" FORCE)
-set(CMAKE_CXX_FLAGS "-Wall -Wextra -g -O2 -DNDEBG" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
 set(MPI_CXX "clang++" CACHE STRING "" FORCE)
 set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)
 unset(HAVE_OMP_H_INCLUDE CACHE)
--- a/cmake/presets/hip.cmake
+++ b/cmake/presets/hip.cmake
@ -0,0 +1,12 @@
+# preset that will enable hipcc plus gcc with support for MPI and OpenMP (on Linux boxes)
+
+set(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "" FORCE)
+set(CMAKE_C_COMPILER "gcc" CACHE STRING "" FORCE)
+set(CMAKE_CXX_FLAGS "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
+unset(HAVE_OMP_H_INCLUDE CACHE)
+
+set(OpenMP_CXX "hipcc" CACHE STRING "" FORCE)
+set(OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" FORCE)
+set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
+set(OpenMP_omp_LIBRARY "libomp.so" CACHE PATH "" FORCE)
+
--- a/doc/include-file-conventions.md
+++ b/doc/include-file-conventions.md
@ -49,22 +49,15 @@ include files provided with LAMMPS are included with double quotes

 For headers declaring functions of the C-library, the corresponding
 C++ versions should be included (examples: `#include <cstdlib>` or
-`#include <cctypes>`).  However, these includes are limited to those defined
-in the C++98 standard.  Some files thus must use the older style until
-the minimum C++ standard requirement of LAMMPS is lifted to C++11 or
-even beyond (examples: `#include <stdint.h>` versus `#include <cstdint>`
-or `#include <inttypes.h>` versus `#include <cinttypes>`).
+`#include <cctype>` instead of `#include <stdlib.h>` or
+`#include <ctype.h>`).

 ### C++ Standard Compliance

-LAMMPS core files currently correspond to the C++98 standard. Files
-requiring C++11 or later are only permitted in (optional) packages
-and particularly packages that are not part of the list of commonly
-used packages such as MOLECULE, KSPACE, MANYBODY, or RIGID.
-
-Also, LAMMPS uses the C-style stdio library for I/O instead of iostreams.
-Since using both at the same time can cause problems, iostreams should
-be avoided where possible.
+LAMMPS core files use standard conforming C++ compatible with the
+C++11 standard, unless explicitly noted.  Also, LAMMPS uses the C-style
+stdio library for I/O instead of iostreams.  Since using both at the
+same time can cause problems, iostreams should be avoided where possible.

 ### Lean Header Files

--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@ -320,11 +320,12 @@ to have an executable that will run on this and newer architectures.

 .. note::

-   NVIDIA GPUs with CC 5.0 (Maxwell) and newer are not compatible with
-   CC 3.x (Kepler).  If you run Kokkos on a newer architecture than what
-   LAMMPS was compiled with, there will be a significant delay during
-   device initialization since the just-in-time compiler has to
-   recompile the GPU kernel code for the new hardware.
+   If you run Kokkos on a newer GPU architecture than what LAMMPS was
+   compiled with, there will be a delay during device initialization
+   since the just-in-time compiler has to recompile all GPU kernels
+   for the new hardware.  This is, however, not possible when LAMMPS was
+   compiled for NVIDIA GPUs with CC 3.x (Kepler) and is run on GPUs with
+   CC 5.0 (Maxwell) or newer, as the two are not compatible.

 The settings discussed below have been tested with LAMMPS and are
 confirmed to work.  Kokkos is an active project with ongoing improvements
@ -343,73 +344,109 @@ be specified in uppercase.
   :widths: auto

   *  - **Arch-ID**
+      - **HOST or GPU**
      - **Description**
   *  - AMDAVX
+      - HOST
      - AMD 64-bit x86 CPU (AVX 1)
   *  - EPYC
+      - HOST
      - AMD EPYC Zen class CPU (AVX 2)
   *  - ARMV80
+      - HOST
      - ARMv8.0 Compatible CPU
   *  - ARMV81
+      - HOST
      - ARMv8.1 Compatible CPU
   *  - ARMV8_THUNDERX
+      - HOST
      - ARMv8 Cavium ThunderX CPU
   *  - ARMV8_THUNDERX2
+      - HOST
      - ARMv8 Cavium ThunderX2 CPU
   *  - WSM
+      - HOST
      - Intel Westmere CPU (SSE 4.2)
   *  - SNB
+      - HOST
      - Intel Sandy/Ivy Bridge CPU (AVX 1)
   *  - HSW
+      - HOST
      - Intel Haswell CPU (AVX 2)
   *  - BDW
+      - HOST
      - Intel Broadwell Xeon E-class CPU (AVX 2 + transactional mem)
   *  - SKX
+      - HOST
      - Intel Sky Lake Xeon E-class HPC CPU (AVX512 + transactional mem)
   *  - KNC
+      - HOST
      - Intel Knights Corner Xeon Phi
   *  - KNL
+      - HOST
      - Intel Knights Landing Xeon Phi
   *  - BGQ
+      - HOST
      - IBM Blue Gene/Q CPU
   *  - POWER7
-      - IBM POWER8 CPU
+      - HOST
+      - IBM POWER7 CPU
   *  - POWER8
+      - HOST
      - IBM POWER8 CPU
   *  - POWER9
+      - HOST
      - IBM POWER9 CPU
   *  - KEPLER30
+      - GPU
      - NVIDIA Kepler generation CC 3.0 GPU
   *  - KEPLER32
+      - GPU
      - NVIDIA Kepler generation CC 3.2 GPU
   *  - KEPLER35
+      - GPU
      - NVIDIA Kepler generation CC 3.5 GPU
   *  - KEPLER37
+      - GPU
      - NVIDIA Kepler generation CC 3.7 GPU
   *  - MAXWELL50
+      - GPU
      - NVIDIA Maxwell generation CC 5.0 GPU
   *  - MAXWELL52
+      - GPU
      - NVIDIA Maxwell generation CC 5.2 GPU
   *  - MAXWELL53
+      - GPU
      - NVIDIA Maxwell generation CC 5.3 GPU
   *  - PASCAL60
+      - GPU
      - NVIDIA Pascal generation CC 6.0 GPU
   *  - PASCAL61
+      - GPU
      - NVIDIA Pascal generation CC 6.1 GPU
   *  - VOLTA70
+      - GPU
      - NVIDIA Volta generation CC 7.0 GPU
   *  - VOLTA72
+      - GPU
      - NVIDIA Volta generation CC 7.2 GPU
   *  - TURING75
+      - GPU
      - NVIDIA Turing generation CC 7.5 GPU
+   *  - VEGA900
+      - GPU
+      - AMD GPU MI25 GFX900
+   *  - VEGA906
+      - GPU
+      - AMD GPU MI50/MI60 GFX906

-CMake build settings:
-^^^^^^^^^^^^^^^^^^^^^
+Basic CMake build settings:
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 For multicore CPUs using OpenMP, set these 2 variables.

 .. code-block:: bash

-   -D Kokkos_ARCH_CPUARCH=yes  # CPUARCH = CPU from list above
+   -D Kokkos_ARCH_HOSTARCH=yes  # HOSTARCH = HOST from list above
   -D Kokkos_ENABLE_OPENMP=yes
   -D BUILD_OMP=yes

@ -427,15 +464,19 @@ For NVIDIA GPUs using CUDA, set these variables:

 .. code-block:: bash

-   -D Kokkos_ARCH_CPUARCH=yes    # CPUARCH = CPU from list above
+   -D Kokkos_ARCH_HOSTARCH=yes   # HOSTARCH = HOST from list above
   -D Kokkos_ARCH_GPUARCH=yes    # GPUARCH = GPU from list above
   -D Kokkos_ENABLE_CUDA=yes
   -D Kokkos_ENABLE_OPENMP=yes
   -D CMAKE_CXX_COMPILER=wrapper # wrapper = full path to Cuda nvcc wrapper

-The wrapper value is the Cuda nvcc compiler wrapper provided in the
-Kokkos library: ``lib/kokkos/bin/nvcc_wrapper``\ .  The setting should
-include the full path name to the wrapper, e.g.
+This will also enable executing FFTs on the GPU, either via the internal
+KISSFFT library, or - by preference - with the cuFFT library bundled
+with the CUDA toolkit, depending on whether CMake can identify its
+location.  The *wrapper* value for the ``CMAKE_CXX_COMPILER`` variable is
+the path to the CUDA nvcc compiler wrapper provided in the Kokkos
+library: ``lib/kokkos/bin/nvcc_wrapper``\ .  The setting should include
+the full path name to the wrapper, e.g.

 .. code-block:: bash

@ -455,8 +496,8 @@ common packages enabled, you can do the following:
   cmake -C ../cmake/presets/minimal.cmake -C ../cmake/presets/kokkos-cuda.cmake ../cmake
   cmake --build .

-Traditional make settings:
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+Basic traditional make settings:
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Choose which hardware to support in ``Makefile.machine`` via
 ``KOKKOS_DEVICES`` and ``KOKKOS_ARCH`` settings.  See the
@ -467,7 +508,7 @@ For multicore CPUs using OpenMP:
 .. code-block:: make

   KOKKOS_DEVICES = OpenMP
-   KOKKOS_ARCH = CPUARCH          # CPUARCH = CPU from list above
+   KOKKOS_ARCH = HOSTARCH          # HOSTARCH = HOST from list above

 For Intel KNLs using OpenMP:

@ -481,7 +522,8 @@ For NVIDIA GPUs using CUDA:
 .. code-block:: make

   KOKKOS_DEVICES = Cuda
-   KOKKOS_ARCH = CPUARCH,GPUARCH  # CPUARCH = CPU from list above that is hosting the GPU
+   KOKKOS_ARCH = HOSTARCH,GPUARCH  # HOSTARCH = HOST from list above that is hosting the GPU
+   KOKKOS_CUDA_OPTIONS = "enable_lambda"
                                  # GPUARCH = GPU from list above
   FFT_INC = -DFFT_CUFFT          # enable use of cuFFT (optional)
   FFT_LIB = -lcufft              # link to cuFFT library
@ -504,6 +546,44 @@ C++ compiler for non-Kokkos, non-CUDA files.
   KOKKOS_ABSOLUTE_PATH = $(shell cd $(KOKKOS_PATH); pwd)
   CC = mpicxx -cxx=$(KOKKOS_ABSOLUTE_PATH)/config/nvcc_wrapper

+
+Advanced KOKKOS compilation settings
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are other allowed options when building with the KOKKOS package
+that can improve performance or assist in debugging or profiling. Below
+are some examples that may be useful in combination with LAMMPS.  For
+the full list (which keeps changing as the Kokkos package itself evolves),
+please consult the Kokkos library documentation.
+
+As an alternative to using multi-threading via OpenMP
+(``-DKokkos_ENABLE_OPENMP=on`` or ``KOKKOS_DEVICES=OpenMP``) it is also
+possible to use Posix threads directly (``-DKokkos_ENABLE_PTHREAD=on``
+or ``KOKKOS_DEVICES=Pthread``).  While binding of threads to individual
+or groups of CPU cores is managed in OpenMP with environment variables,
+you need assistance from either the "hwloc" or "libnuma" library for the
+Pthread thread parallelization option. To enable use with CMake:
+``-DKokkos_ENABLE_HWLOC=on`` or ``-DKokkos_ENABLE_LIBNUMA=on``; and with
+conventional make: ``KOKKOS_USE_TPLS=hwloc`` or
+``KOKKOS_USE_TPLS=libnuma``.
+
+The CMake option ``-DKokkos_ENABLE_LIBRT=on`` or the makefile setting
+``KOKKOS_USE_TPLS=librt`` enables the use of a more accurate timer
+mechanism on many Unix-like platforms for internal profiling.
+
+The CMake option ``-DKokkos_ENABLE_DEBUG=on`` or the makefile setting
+``KOKKOS_DEBUG=yes`` enables printing of run-time
+debugging information that can be useful. It also enables runtime
+bounds checking on Kokkos data structures.  As is to be expected, enabling
+this option will negatively impact performance and thus is only
+recommended when developing a Kokkos-enabled style in LAMMPS.
+
+The CMake option ``-DKokkos_ENABLE_CUDA_UVM=on`` or the makefile
+setting ``KOKKOS_CUDA_OPTIONS=enable_lambda,force_uvm`` enables the
+use of CUDA "Unified Virtual Memory" in Kokkos.  Please note that
+the LAMMPS KOKKOS package must **always** be compiled with the
+*enable_lambda* option when using GPUs.
+
 ----------

 .. _latte:
--- a/doc/src/Speed_kokkos.rst
+++ b/doc/src/Speed_kokkos.rst
@ -9,10 +9,7 @@ different back end languages such as CUDA, OpenMP, or Pthreads.  The
 Kokkos library also provides data abstractions to adjust (at compile
 time) the memory layout of data structures like 2d and 3d arrays to
 optimize performance on different hardware. For more information on
-Kokkos, see `GitHub <https://github.com/kokkos/kokkos>`_. Kokkos is
-part of `Trilinos <https://www.trilinos.org/>`_. The Kokkos
-library was written primarily by Carter Edwards, Christian Trott, and
-Dan Sunderland (all Sandia).
+Kokkos, see `GitHub <https://github.com/kokkos/kokkos>`_.

 The LAMMPS KOKKOS package contains versions of pair, fix, and atom
 styles that use data structures and macros provided by the Kokkos
@ -21,7 +18,7 @@ package was developed primarily by Christian Trott (Sandia) and Stan
 Moore (Sandia) with contributions of various styles by others,
 including Sikandar Mashayak (UIUC), Ray Shan (Sandia), and Dan Ibanez
 (Sandia). For more information on developing using Kokkos abstractions
-see the Kokkos programmers' guide at /lib/kokkos/doc/Kokkos_PG.pdf.
+see the Kokkos `Wiki <https://github.com/kokkos/kokkos/wiki>`_.

 Kokkos currently provides support for 3 modes of execution (per MPI
 task). These are Serial (MPI-only for CPUs and Intel Phi), OpenMP
@ -31,33 +28,30 @@ compatible with specific hardware.

 .. note::

-   Kokkos support within LAMMPS must be built with a C++11 compatible
-   compiler. This means GCC version 4.7.2 or later, Intel 14.0.4 or later, or
-   Clang 3.5.2 or later is required.
-
-.. note::
-
-   To build with Kokkos support for NVIDIA GPUs, NVIDIA CUDA
+   To build with Kokkos support for NVIDIA GPUs, the NVIDIA CUDA toolkit
   software version 9.0 or later must be installed on your system. See
-   the discussion for the :doc:`GPU package <Speed_gpu>` for details of how
-   to check and do this.
+   the discussion for the :doc:`GPU package <Speed_gpu>` for details of
+   how to check and do this.

 .. note::

-   Kokkos with CUDA currently implicitly assumes that the MPI library
-   is CUDA-aware. This is not always the case, especially when using
-   pre-compiled MPI libraries provided by a Linux distribution. This is not
-   a problem when using only a single GPU with a single MPI rank. When
-   running with multiple MPI ranks, you may see segmentation faults without
-   CUDA-aware MPI support. These can be avoided by adding the flags :doc:`-pk kokkos cuda/aware off <Run_options>` to the LAMMPS command line or by
-   using the command :doc:`package kokkos cuda/aware off <package>` in the
-   input file.
+   Kokkos with CUDA currently implicitly assumes that the MPI library is
+   CUDA-aware. This is not always the case, especially when using
+   pre-compiled MPI libraries provided by a Linux distribution. This is
+   not a problem when using only a single GPU with a single MPI
+   rank. When running with multiple MPI ranks, you may see segmentation
+   faults without CUDA-aware MPI support. These can be avoided by adding
+   the flags :doc:`-pk kokkos cuda/aware off <Run_options>` to the
+   LAMMPS command line or by using the command :doc:`package kokkos
+   cuda/aware off <package>` in the input file.

-**Building LAMMPS with the KOKKOS package:**
+Building LAMMPS with the KOKKOS package
+"""""""""""""""""""""""""""""""""""""""

 See the :ref:`Build extras <kokkos>` doc page for instructions.

-**Running LAMMPS with the KOKKOS package:**
+Running LAMMPS with the KOKKOS package
+""""""""""""""""""""""""""""""""""""""

 All Kokkos operations occur within the context of an individual MPI
 task running on a single node of the machine. The total number of MPI
@ -66,7 +60,8 @@ usual manner via the mpirun or mpiexec commands, and is independent of
 Kokkos. E.g. the mpirun command in OpenMPI does this via its -np and
 -npernode switches. Ditto for MPICH via -np and -ppn.

-**Running on a multi-core CPU:**
+Running on a multi-core CPU
+^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Here is a quick overview of how to use the KOKKOS package
 for CPU acceleration, assuming one or more 16-core nodes.
@ -142,7 +137,8 @@ atom.  When using the Kokkos Serial back end or the OpenMP back end with
 a single thread, no duplication or atomic operations are used. For CUDA
 and half neighbor lists, the KOKKOS package always uses atomic operations.

-**Core and Thread Affinity:**
+CPU Cores, Sockets and Thread Affinity
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 When using multi-threading, it is important for performance to bind
 both MPI tasks to physical cores, and threads to physical cores, so
@ -156,15 +152,16 @@ for your MPI installation), binding can be forced with these flags:
   OpenMPI 1.8: mpirun -np 2 --bind-to socket --map-by socket ./lmp_openmpi ...
   Mvapich2 2.0: mpiexec -np 2 --bind-to socket --map-by socket ./lmp_mvapich ...

-For binding threads with KOKKOS OpenMP, use thread affinity
-environment variables to force binding. With OpenMP 3.1 (gcc 4.7 or
-later, intel 12 or later) setting the environment variable
-OMP_PROC_BIND=true should be sufficient. In general, for best
-performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and
-OMP_PLACES=threads.  For binding threads with the KOKKOS pthreads
-option, compile LAMMPS the KOKKOS HWLOC=yes option as described below.
+For binding threads with KOKKOS OpenMP, use thread affinity environment
+variables to force binding. With OpenMP 3.1 (gcc 4.7 or later, intel 12
+or later) setting the environment variable ``OMP_PROC_BIND=true`` should
+be sufficient. In general, for best performance with OpenMP 4.0 or later
+set ``OMP_PROC_BIND=spread`` and ``OMP_PLACES=threads``.  For binding
+threads with the KOKKOS pthreads option, compile LAMMPS with the hwloc
+or libnuma support enabled as described in the :ref:`extra build options page <kokkos>`.

-**Running on Knight's Landing (KNL) Intel Xeon Phi:**
+Running on Knight's Landing (KNL) Intel Xeon Phi
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Here is a quick overview of how to use the KOKKOS package for the
 Intel Knight's Landing (KNL) Xeon Phi:
@ -222,7 +219,8 @@ threads/task as Nt. The product of these two values should be N, i.e.
   them in "native" mode, not "offload" mode like the USER-INTEL package
   supports.

-**Running on GPUs:**
+Running on GPUs
+^^^^^^^^^^^^^^^

 Use the "-k" :doc:`command-line switch <Run_options>` to specify the
 number of GPUs per node. Typically the -np setting of the mpirun command
@ -257,7 +255,7 @@ one or more nodes, each with two GPUs:
   running on GPUs is to use "full" neighbor lists and set the Newton flag
   to "off" for both pairwise and bonded interactions, along with threaded
   communication. When running on Maxwell or Kepler GPUs, this will
-   typically be best. For Pascal GPUs, using "half" neighbor lists and
+   typically be best. For Pascal GPUs and beyond, using "half" neighbor lists and
   setting the Newton flag to "on" may be faster. For many pair styles,
   setting the neighbor binsize equal to twice the CPU default value will
   give speedup, which is the default when running on GPUs. Use the "-pk
@ -270,13 +268,6 @@ one or more nodes, each with two GPUs:

   mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj      # Newton on, half neighbor list, set binsize = neighbor ghost cutoff

-.. note::
-
-   For good performance of the KOKKOS package on GPUs, you must
-   have Kepler generation GPUs (or later). The Kokkos library exploits
-   texture cache options not supported by Telsa generation GPUs (or
-   older).
-
 .. note::

   When using a GPU, you will achieve the best performance if your
@ -293,7 +284,8 @@ one or more nodes, each with two GPUs:
   kspace, etc., you must set the environment variable CUDA_LAUNCH_BLOCKING=1.
   However, this will reduce performance and is not recommended for production runs.

-**Run with the KOKKOS package by editing an input script:**
+Run with the KOKKOS package by editing an input script
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Alternatively the effect of the "-sf" or "-pk" switches can be
 duplicated by adding the :doc:`package kokkos <package>` or :doc:`suffix kk <suffix>` commands to your input script.
@ -316,17 +308,24 @@ You only need to use the :doc:`package kokkos <package>` command if you
 wish to change any of its option defaults, as set by the "-k on"
 :doc:`command-line switch <Run_options>`.

-**Using OpenMP threading and CUDA together (experimental):**
+**Using OpenMP threading and CUDA together:**

 With the KOKKOS package, both OpenMP multi-threading and GPUs can be
-used together in a few special cases. In the Makefile, the
-KOKKOS_DEVICES variable must include both "Cuda" and "OpenMP", as is
-the case for /src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi
+compiled and used together in a few special cases. In the makefile for
+the conventional build, the KOKKOS_DEVICES variable must include both
+"Cuda" and "OpenMP", as is the case for ``/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi``.

 .. code-block:: bash

   KOKKOS_DEVICES=Cuda,OpenMP

+When building with CMake you need to enable both features as is done
+in the ``kokkos-cuda.cmake`` CMake preset file.
+
+.. code-block:: bash
+
+   cmake ../cmake -DKokkos_ENABLE_CUDA=yes -DKokkos_ENABLE_OPENMP=yes
+
 The suffix "/kk" is equivalent to "/kk/device", and for Kokkos CUDA,
 using the "-sf kk" in the command line gives the default CUDA version
 everywhere.  However, if the "/kk/host" suffix is added to a specific
@ -360,7 +359,8 @@ suffix for kspace and bonds, angles, etc.  in the input file and the
 sure the environment variable CUDA_LAUNCH_BLOCKING is not set to "1"
 so CPU/GPU overlap can occur.

-**Speed-ups to expect:**
+Performance to expect
+"""""""""""""""""""""

 The performance of KOKKOS running in different modes is a function of
 your hardware, which KOKKOS-enable styles are used, and the problem
@ -377,52 +377,26 @@ Generally speaking, the following rules of thumb apply:
  performance of a KOKKOS style is a bit slower than the USER-OMP
  package.
 * When running large number of atoms per GPU, KOKKOS is typically faster
-  than the GPU package.
+  than the GPU package when compiled for double precision. The benefit
+  of using single or mixed precision with the GPU package depends
+  significantly on the hardware in use and the simulated system and pair
+  style.
 * When running on Intel hardware, KOKKOS is not as fast as
-  the USER-INTEL package, which is optimized for that hardware.
+  the USER-INTEL package, which is optimized for x86 hardware (not just
+  from Intel) and compilation with the Intel compilers.  The USER-INTEL
+  package also can increase the vector length of vector instructions
+  by switching to single or mixed precision mode.

 See the `Benchmark page <https://lammps.sandia.gov/bench.html>`_ of the
 LAMMPS web site for performance of the KOKKOS package on different
 hardware.

-**Advanced Kokkos options:**
+Advanced Kokkos options
+"""""""""""""""""""""""

-There are other allowed options when building with the KOKKOS package.
-As explained on the :ref:`Build extras <kokkos>` doc page,
-they can be set either as variables on the make command line or in
-Makefile.machine, or they can be specified as CMake variables.  Each
-takes a value shown below.  The default value is listed, which is set
-in the lib/kokkos/Makefile.kokkos file.
-
-* KOKKOS_DEBUG, values = *yes*\ , *no*\ , default = *no*
-* KOKKOS_USE_TPLS, values = *hwloc*\ , *librt*\ , *experimental_memkind*, default = *none*
-* KOKKOS_CXX_STANDARD, values = *c++11*\ , *c++1z*\ , default = *c++11*
-* KOKKOS_OPTIONS, values = *aggressive_vectorization*, *disable_profiling*, default = *none*
-* KOKKOS_CUDA_OPTIONS, values = *force_uvm*, *use_ldg*, *rdc*\ , *enable_lambda*, default = *enable_lambda*
-
-KOKKOS_USE_TPLS=hwloc binds threads to hardware cores, so they do not
-migrate during a simulation. KOKKOS_USE_TPLS=hwloc should always be
-used if running with KOKKOS_DEVICES=Pthreads for pthreads. It is not
-necessary for KOKKOS_DEVICES=OpenMP for OpenMP, because OpenMP
-provides alternative methods via environment variables for binding
-threads to hardware cores.  More info on binding threads to cores is
-given on the :doc:`Speed omp <Speed_omp>` doc page.
-
-KOKKOS_USE_TPLS=librt enables use of a more accurate timer mechanism
-on most Unix platforms. This library is not available on all
-platforms.
-
-KOKKOS_DEBUG is only useful when developing a Kokkos-enabled style
-within LAMMPS. KOKKOS_DEBUG=yes enables printing of run-time
-debugging information that can be useful. It also enables runtime
-bounds checking on Kokkos data structures.
-
-KOKKOS_CXX_STANDARD and KOKKOS_OPTIONS are typically not changed when
-building LAMMPS.
-
-KOKKOS_CUDA_OPTIONS are additional options for CUDA. The LAMMPS KOKKOS
-package must be compiled with the *enable_lambda* option when using
-GPUs.
+There are other allowed options when building with the KOKKOS package
+that can improve performance or assist in debugging or profiling.
+They are explained on the :ref:`KOKKOS section of the build extras <kokkos>` doc page.

 Restrictions
 """"""""""""
--- a/doc/src/change_box.rst
+++ b/doc/src/change_box.rst
@ -299,6 +299,12 @@ match what is stored in the restart file.  So if you wish to change
 them, you should use the change_box command after the read_restart
 command.

+.. note::
+
+   Changing a periodic boundary to a non-periodic one will also
+   cause the image flag for that dimension to be reset to 0 for
+   all atoms.  LAMMPS will print a warning message if that happens.
+
 ----------

 The *ortho* and *triclinic* keywords convert the simulation box to be
--- a/doc/src/compute_orientorder_atom.rst
+++ b/doc/src/compute_orientorder_atom.rst
@ -3,6 +3,9 @@
 compute orientorder/atom command
 ================================

+compute orientorder/atom/kk command
+===================================
+
 Syntax
 """"""

@ -128,6 +131,30 @@ too frequently.
   :doc:`special_bonds <special_bonds>` command that includes all pairs in
   the neighbor list.

+----------
+
+
+Styles with a *gpu*\ , *intel*\ , *kk*\ , *omp*\ , or *opt* suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed on the :doc:`Speed packages <Speed_packages>` doc
+page.  The accelerated styles take the same arguments and should
+produce the same results, except for round-off and precision issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the :doc:`Build package <Build_package>` doc page for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the :doc:`-suffix command-line switch <Run_options>` when you invoke LAMMPS, or you can use the
+:doc:`suffix <suffix>` command in your input script.
+
+See the :doc:`Speed packages <Speed_packages>` doc page for more
+instructions on how to use the accelerated styles effectively.
+
+
+----------
+
 **Output info:**

 This compute calculates a per-atom array with *nlvalues* columns,
--- a/doc/src/compute_stress_atom.rst
+++ b/doc/src/compute_stress_atom.rst
@ -2,6 +2,7 @@

 compute stress/atom command
 ===========================
+
 compute centroid/stress/atom command
 ====================================

@ -223,15 +224,14 @@ The per-atom array values will be in pressure\*volume

 Restrictions
 """"""""""""
-Currently, compute *centroid/stress/atom* does not support
-pair styles with many-body interactions,
-such as :doc:`Tersoff <pair_tersoff>`,
-and LAMMPS will generate an error in such cases.
-In principal, equivalent formulation
-to that of angle, dihedral and improper contributions
-in the virial :math:`W_{ab}` formula
-can also be applied to the many-body pair styles,
-and is planned in the future.
+
+Currently (Spring 2020), compute *centroid/stress/atom* does not support
+pair styles with many-body interactions, such as :doc:`Tersoff
+<pair_tersoff>`, or pair styles with long-range Coulomb interactions.
+LAMMPS will generate an error in such cases.  In principle, an equivalent
+formulation to that of angle, dihedral and improper contributions in the
+virial :math:`W_{ab}` formula can also be applied to the many-body pair
+styles, and is planned in the future.

 Related commands
 """"""""""""""""
--- a/doc/src/fix_bond_react.rst
+++ b/doc/src/fix_bond_react.rst
@ -158,7 +158,9 @@ The following comments pertain to each *react* argument (in other
 words, can be customized for each reaction, or reaction step):

 A check for possible new reaction sites is performed every *Nevery*
-timesteps.
+timesteps. *Nevery* can be specified with an equal-style
+:doc:`variable <variable>`, whose value is rounded up to the nearest
+integer.

 Three physical conditions must be met for a reaction to occur. First,
 a bonding atom pair must be identified within the reaction distance
@ -171,19 +173,29 @@ modified to match the post-reaction template.
 A bonding atom pair will be identified if several conditions are met.
 First, a pair of atoms I,J within the specified react-group-ID of type
 itype and jtype must be separated by a distance between *Rmin* and
-*Rmax*\ . It is possible that multiple bonding atom pairs are
-identified: if the bonding atoms in the pre-reacted template are  1-2
-neighbors, i.e. directly bonded, the farthest bonding atom partner is
-set as its bonding partner; otherwise, the closest potential partner
-is chosen. Then, if both an atom I and atom J have each other as their
-bonding partners, these two atoms are identified as the bonding atom
-pair of the reaction site. Once this unique bonding atom pair is
-identified for each reaction, there could two or more reactions that
-involve a given atom on the same timestep. If this is the case, only
-one such reaction is permitted to occur. This reaction is chosen
-randomly from all potential reactions. This capability allows e.g. for
-different reaction pathways to proceed from identical reaction sites
-with user-specified probabilities.
+*Rmax*\ . *Rmin* and *Rmax* can be specified with equal-style
+:doc:`variables <variable>`. For example, these reaction cutoffs can
+be a function of the reaction conversion using the following commands:
+
+.. code-block:: LAMMPS
+
+   variable rmax equal 0 # initialize variable before bond/react
+   fix myrxn all bond/react react myrxn1 all 1 0 v_rmax mol1 mol2 map_file.txt
+   variable rmax equal 3+f_myrxn[1]/100 # arbitrary function of reaction count
+
+It is possible that multiple bonding atom pairs are identified: if the
+bonding atoms in the pre-reacted template are 1-2 neighbors, i.e.
+directly bonded, the farthest bonding atom partner is set as its
+bonding partner; otherwise, the closest potential partner is chosen.
+Then, if both an atom I and atom J have each other as their bonding
+partners, these two atoms are identified as the bonding atom pair of
+the reaction site. Once this unique bonding atom pair is identified
+for each reaction, there could be two or more reactions that involve a
+given atom on the same timestep. If this is the case, only one such
+reaction is permitted to occur. This reaction is chosen randomly from
+all potential reactions. This capability allows e.g. for different
+reaction pathways to proceed from identical reaction sites with
+user-specified probabilities.

 The pre-reacted molecule template is specified by a molecule command.
 This molecule template file contains a sample reaction site and its
@ -419,7 +431,8 @@ it occurs:

 The *prob* keyword can affect whether or not an eligible reaction
 actually occurs. The fraction setting must be a value between 0.0 and
-1.0. A uniform random number between 0.0 and 1.0 is generated and the
+1.0, and can be specified with an equal-style :doc:`variable <variable>`.
+A uniform random number between 0.0 and 1.0 is generated and the
 eligible reaction only occurs if the random number is less than the
 fraction. Up to N reactions are permitted to occur, as optionally
 specified by the *max_rxn* keyword.
@ -489,10 +502,11 @@ local command.

 **Restart, fix_modify, output, run start/stop, minimize info:**

-Cumulative reaction counts for each reaction are written to :doc:`binary restart files <restart>`. These values are associated with the
-reaction name (react-ID). Additionally, internally-created per-atom
-properties are stored to allow for smooth restarts. None of the
-:doc:`fix_modify <fix_modify>` options are relevant to this fix.
+Cumulative reaction counts for each reaction are written to :doc:`binary restart files <restart>`.
+These values are associated with the reaction name (react-ID).
+Additionally, internally-created per-atom properties are stored to
+allow for smooth restarts. None of the :doc:`fix_modify <fix_modify>`
+options are relevant to this fix.

 This fix computes one statistic for each *react* argument that it
 stores in a global vector, of length 'number of react arguments', that
--- a/doc/src/fix_heat.rst
+++ b/doc/src/fix_heat.rst
@ -47,16 +47,22 @@ and the specified geometric :doc:`region <region>` in order to have
 energy added or subtracted to it.  If not specified, then the atoms in
 the group are affected wherever they may move to.

-Heat addition/subtraction is performed every N timesteps.  The *eflux*
-parameter can be specified as a numeric constant or as a variable (see
-below).  If it is a numeric constant or equal-style variable which
-evaluates to a scalar value, then the *eflux* determines the change in
-aggregate energy of the entire group of atoms per unit time, e.g. in
-eV/psec for :doc:`metal units <units>`.  In this case it is an
-"extensive" quantity, meaning its magnitude should be scaled with the
-number of atoms in the group.  Note that since *eflux* has per-time
-units (i.e. it is a flux), this means that a larger value of N will
-add/subtract a larger amount of energy each time the fix is invoked.
+Heat addition/subtraction is performed every N timesteps.
+
+The *eflux* parameter can be specified as a numeric constant or as an
+equal- or atom-style :doc:`variable <variable>`.  If the value is a
+variable, it should be specified as v_name, where *name* is the variable
+name.  In this case, the variable will be evaluated each timestep, and
+its current value(s) used to determine the flux.
+
+If *eflux* is a numeric constant or equal-style variable which evaluates
+to a scalar value, then *eflux* determines the change in aggregate energy
+of the entire group of atoms per unit time, e.g. in eV/psec for
+:doc:`metal units <units>`.  In this case it is an "extensive" quantity,
+meaning its magnitude should be scaled with the number of atoms in the
+group.  Note that since *eflux* also has per-time units (i.e. it is a
+flux), this means that a larger value of N will add/subtract a larger
+amount of energy each time the fix is invoked.

 .. note::

@ -71,12 +77,6 @@ the energy flux for a single atom, again in units of energy per unit
 time.  In this case, each value is an "intensive" quantity, which need
 not be scaled with the number of atoms in the group.

-As mentioned above, the *eflux* parameter can be specified as an
-equal-style or atom_style :doc:`variable <variable>`.  If the value is a
-variable, it should be specified as v_name, where name is the variable
-name.  In this case, the variable will be evaluated each timestep, and
-its value(s) used to determine the flux.
-
 Equal-style variables can specify formulas with various mathematical
 functions, and include :doc:`thermo_style <thermo_style>` command
 keywords for the simulation box parameters and timestep and elapsed
--- a/doc/src/fix_wall_body_polyhedron.rst
+++ b/doc/src/fix_wall_body_polyhedron.rst
@ -15,15 +15,14 @@ Syntax
 * k_n = normal repulsion strength (force/distance units or pressure units - see discussion below)
 * c_n = normal damping coefficient (force/distance units or pressure units - see discussion below)
 * c_t = tangential damping coefficient (force/distance units or pressure units - see discussion below)
-* wallstyle = *xplane* or *yplane* or *zplane* or *zcylinder*
+* wallstyle = *xplane* or *yplane* or *zplane*
 * args = list of arguments for a particular style

  .. parsed-literal::

-       *xplane* or *yplane* args = lo hi
+       *xplane* or *yplane* or *zplane* args = lo hi
         lo,hi = position of lower and upper plane (distance units), either can be NULL)
-       *zcylinder* args = radius
-         radius = cylinder radius (distance units)
+

 * zero or more keyword/value pairs may be appended to args
 * keyword = *wiggle*
@ -60,8 +59,7 @@ those specified with the :doc:`pair_style body/rounded/polyhedron <pair_body_rou
 The *wallstyle* can be planar or cylindrical.  The 3 planar options
 specify a pair of walls in a dimension.  Wall positions are given by
 *lo* and *hi*\ .  Either of the values can be specified as NULL if a
-single wall is desired.  For a *zcylinder* wallstyle, the cylinder's
-axis is at x = y = 0.0, and the radius of the cylinder is specified.
+single wall is desired.

 Optionally, the wall can be moving, if the *wiggle* keyword is appended.

@ -71,8 +69,7 @@ particles.  The arguments to the *wiggle* keyword specify a dimension
 for the motion, as well as it's *amplitude* and *period*\ .  Note that
 if the dimension is in the plane of the wall, this is effectively a
 shearing motion.  If the dimension is perpendicular to the wall, it is
-more of a shaking motion.  A *zcylinder* wall can only be wiggled in
-the z dimension.
+more of a shaking motion.

 Each timestep, the position of a wiggled wall in the appropriate *dim*
 is set according to this equation:
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -499,6 +499,7 @@ cuda
 Cuda
 CUDA
 CuH
+cuFFT
 Cummins
 Curk
 customIDs
@ -1544,6 +1545,7 @@ libmeam
 libmessage
 libmpi
 libmpich
+libnuma
 libplumed
 libplumedKernel
 libpng
--- a/examples/USER/reaction/nylon,6-6_melt/large_nylon_melt.data.gz
+++ b/examples/USER/reaction/nylon,6-6_melt/large_nylon_melt.data.gz
--- a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability
+++ b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability
@ -0,0 +1,56 @@
+# two monomer nylon example
+# reaction produces a condensed water molecule
+
+units real
+
+boundary p p p
+
+atom_style full
+
+kspace_style pppm 1.0e-4
+
+pair_style lj/class2/coul/long 8.5
+
+angle_style class2
+
+bond_style class2
+
+dihedral_style class2
+
+improper_style class2
+
+read_data tiny_nylon.data
+
+variable runsteps equal 1000
+variable prob1 equal step/v_runsteps*2
+variable prob2 equal (step/v_runsteps)>0.5
+
+velocity all create 300.0 4928459 dist gaussian
+
+molecule mol1 rxn1_stp1_unreacted.data_template
+molecule mol2 rxn1_stp1_reacted.data_template
+molecule mol3 rxn1_stp2_unreacted.data_template
+molecule mol4 rxn1_stp2_reacted.data_template
+
+thermo 50
+
+# dump 1 all xyz 1 test_vis.xyz
+
+fix myrxns all bond/react stabilization yes statted_grp .03 &
+  react rxn1 all 1 0.0 5.0 mol1 mol2 rxn1_stp1_map prob v_prob1 1234 &
+  react rxn2 all 1 0.0 5.0 mol3 mol4 rxn1_stp2_map prob v_prob2 1234
+
+fix 1 statted_grp_REACT nvt temp 300 300 100
+
+# optionally, you can customize behavior of reacting atoms,
+# by using the internally-created 'bond_react_MASTER_group', like so:
+fix 4 bond_react_MASTER_group temp/rescale 1 300 300 10 1
+
+thermo_style custom step temp press density v_prob1 v_prob2 f_myrxns[1] f_myrxns[2]
+
+# restart 100 restart1 restart2
+
+run ${runsteps}
+
+# write_restart restart_longrun
+# write_data restart_longrun.data
--- a/examples/USER/reaction/tiny_nylon/log.22Apr20.tiny_nylon.stabilized_variable_probability.g++.1
+++ b/examples/USER/reaction/tiny_nylon/log.22Apr20.tiny_nylon.stabilized_variable_probability.g++.1
@ -0,0 +1,201 @@
+LAMMPS (15 Apr 2020)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:94)
+  using 1 OpenMP thread(s) per MPI task
+# two monomer nylon example
+# reaction produces a condensed water molecule
+
+units real
+
+boundary p p p
+
+atom_style full
+
+kspace_style pppm 1.0e-4
+
+pair_style lj/class2/coul/long 8.5
+
+angle_style class2
+
+bond_style class2
+
+dihedral_style class2
+
+improper_style class2
+
+read_data tiny_nylon.data
+  orthogonal box = (-25 -25 -25) to (25 25 25)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  44 atoms
+  reading velocities ...
+  44 velocities
+  scanning bonds ...
+  9 = max bonds/atom
+  scanning angles ...
+  21 = max angles/atom
+  scanning dihedrals ...
+  29 = max dihedrals/atom
+  scanning impropers ...
+  29 = max impropers/atom
+  reading bonds ...
+  42 bonds
+  reading angles ...
+  74 angles
+  reading dihedrals ...
+  100 dihedrals
+  reading impropers ...
+  44 impropers
+  4 = max # of 1-2 neighbors
+  6 = max # of 1-3 neighbors
+  12 = max # of 1-4 neighbors
+  41 = max # of special neighbors
+  special bonds CPU = 0.000385045 secs
+  read_data CPU = 0.013443 secs
+
+variable runsteps equal 1000
+variable prob1 equal step/v_runsteps*2
+variable prob2 equal (step/v_runsteps)>0.5
+
+velocity all create 300.0 4928459 dist gaussian
+
+molecule mol1 rxn1_stp1_unreacted.data_template
+Read molecule template mol1:
+  1 molecules
+  18 atoms with max type 8
+  16 bonds with max type 14
+  25 angles with max type 28
+  23 dihedrals with max type 36
+  14 impropers with max type 11
+molecule mol2 rxn1_stp1_reacted.data_template
+Read molecule template mol2:
+  1 molecules
+  18 atoms with max type 9
+  17 bonds with max type 13
+  31 angles with max type 27
+  39 dihedrals with max type 33
+  20 impropers with max type 1
+molecule mol3 rxn1_stp2_unreacted.data_template
+Read molecule template mol3:
+  1 molecules
+  15 atoms with max type 9
+  14 bonds with max type 13
+  25 angles with max type 27
+  30 dihedrals with max type 33
+  16 impropers with max type 1
+molecule mol4 rxn1_stp2_reacted.data_template
+Read molecule template mol4:
+  1 molecules
+  15 atoms with max type 11
+  13 bonds with max type 15
+  19 angles with max type 29
+  16 dihedrals with max type 32
+  10 impropers with max type 13
+
+thermo 50
+
+# dump 1 all xyz 1 test_vis.xyz
+
+fix myrxns all bond/react stabilization yes statted_grp .03   react rxn1 all 1 0.0 5.0 mol1 mol2 rxn1_stp1_map prob v_prob1 1234   react rxn2 all 1 0.0 5.0 mol3 mol4 rxn1_stp2_map prob v_prob2 1234
+WARNING: Bond/react: Atom affected by reaction rxn1 too close to template edge (src/USER-REACTION/fix_bond_react.cpp:2051)
+WARNING: Bond/react: Atom affected by reaction rxn2 too close to template edge (src/USER-REACTION/fix_bond_react.cpp:2051)
+dynamic group bond_react_MASTER_group defined
+dynamic group statted_grp_REACT defined
+
+fix 1 statted_grp_REACT nvt temp 300 300 100
+
+# optionally, you can customize behavior of reacting atoms,
+# by using the internally-created 'bond_react_MASTER_group', like so:
+fix 4 bond_react_MASTER_group temp/rescale 1 300 300 10 1
+
+thermo_style custom step temp press density v_prob1 v_prob2 f_myrxns[1] f_myrxns[2]
+
+# restart 100 restart1 restart2
+
+run ${runsteps}
+run 1000
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:332)
+  G vector (1/distance) = 0.0534597
+  grid = 2 2 2
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0402256
+  estimated relative force accuracy = 0.000121138
+  using double precision FFTW3
+  3d grid and FFT values/proc = 343 8
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 10.5
+  ghost atom cutoff = 10.5
+  binsize = 5.25, bins = 10 10 10
+  2 neighbor lists, perpetual/occasional/extra = 1 1 0
+  (1) pair lj/class2/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) fix bond/react, occasional, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+WARNING: Inconsistent image flags (src/domain.cpp:812)
+Per MPI rank memory allocation (min/avg/max) = 33.78 | 33.78 | 33.78 Mbytes
+Step Temp Press Density v_prob1 v_prob2 f_myrxns[1] f_myrxns[2] 
+       0          300    346.78165 0.0034851739            0            0            0            0 
+      50    262.63913   -492.10749 0.0034851739          0.1            0            1            0 
+     100    766.52962   -29.714349 0.0034851739          0.2            0            1            0 
+     150    503.86837    50.220304 0.0034851739          0.3            0            1            0 
+     200    456.51295    12.312892 0.0034851739          0.4            0            1            0 
+     250    391.54928    9.2335844 0.0034851739          0.5            0            1            0 
+     300     336.6988   -47.193937 0.0034851739          0.6            0            1            0 
+     350    254.06985   -9.2867898 0.0034851739          0.7            0            1            0 
+     400    259.41098   -25.657321 0.0034851739          0.8            0            1            0 
+     450    258.10364      22.5086 0.0034851739          0.9            0            1            0 
+     500    272.13412   -6.5391448 0.0034851739            1            0            1            0 
+     550    202.75504    54.658731 0.0034851739          1.1            1            1            1 
+     600    344.79887    23.798478 0.0034851739          1.2            1            1            1 
+     650    328.44488   -29.908484 0.0034851739          1.3            1            1            1 
+     700    280.13593   -8.3223255 0.0034851739          1.4            1            1            1 
+     750    300.67624    1.0632669 0.0034851739          1.5            1            1            1 
+     800    376.64234    12.488392 0.0034851739          1.6            1            1            1 
+     850    321.07642    19.814074 0.0034851739          1.7            1            1            1 
+     900    332.23751    30.814079 0.0034851739          1.8            1            1            1 
+     950    311.14029    5.7853136 0.0034851739          1.9            1            1            1 
+    1000    253.14634   -37.560642 0.0034851739            2            1            1            1 
+Loop time of 0.379454 on 1 procs for 1000 steps with 44 atoms
+
+Performance: 227.696 ns/day, 0.105 hours/ns, 2635.368 timesteps/s
+99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.069723   | 0.069723   | 0.069723   |   0.0 | 18.37
+Bond    | 0.14802    | 0.14802    | 0.14802    |   0.0 | 39.01
+Kspace  | 0.044252   | 0.044252   | 0.044252   |   0.0 | 11.66
+Neigh   | 0.072359   | 0.072359   | 0.072359   |   0.0 | 19.07
+Comm    | 0.0044748  | 0.0044748  | 0.0044748  |   0.0 |  1.18
+Output  | 0.0022775  | 0.0022775  | 0.0022775  |   0.0 |  0.60
+Modify  | 0.036509   | 0.036509   | 0.036509   |   0.0 |  9.62
+Other   |            | 0.00184    |            |       |  0.48
+
+Nlocal:    44 ave 44 max 44 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    3 ave 3 max 3 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    722 ave 722 max 722 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 722
+Ave neighs/atom = 16.4091
+Ave special neighs/atom = 9.77273
+Neighbor list builds = 1000
+Dangerous builds = 0
+
+# write_restart restart_longrun
+# write_data restart_longrun.data
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:00:00
--- a/examples/USER/reaction/tiny_nylon/log.22Apr20.tiny_nylon.stabilized_variable_probability.g++.4
+++ b/examples/USER/reaction/tiny_nylon/log.22Apr20.tiny_nylon.stabilized_variable_probability.g++.4
@ -0,0 +1,201 @@
+LAMMPS (15 Apr 2020)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:94)
+  using 1 OpenMP thread(s) per MPI task
+# two monomer nylon example
+# reaction produces a condensed water molecule
+
+units real
+
+boundary p p p
+
+atom_style full
+
+kspace_style pppm 1.0e-4
+
+pair_style lj/class2/coul/long 8.5
+
+angle_style class2
+
+bond_style class2
+
+dihedral_style class2
+
+improper_style class2
+
+read_data tiny_nylon.data
+  orthogonal box = (-25 -25 -25) to (25 25 25)
+  1 by 2 by 2 MPI processor grid
+  reading atoms ...
+  44 atoms
+  reading velocities ...
+  44 velocities
+  scanning bonds ...
+  9 = max bonds/atom
+  scanning angles ...
+  21 = max angles/atom
+  scanning dihedrals ...
+  29 = max dihedrals/atom
+  scanning impropers ...
+  29 = max impropers/atom
+  reading bonds ...
+  42 bonds
+  reading angles ...
+  74 angles
+  reading dihedrals ...
+  100 dihedrals
+  reading impropers ...
+  44 impropers
+  4 = max # of 1-2 neighbors
+  6 = max # of 1-3 neighbors
+  12 = max # of 1-4 neighbors
+  41 = max # of special neighbors
+  special bonds CPU = 0.000431282 secs
+  read_data CPU = 0.0129571 secs
+
+variable runsteps equal 1000
+variable prob1 equal step/v_runsteps*2
+variable prob2 equal (step/v_runsteps)>0.5
+
+velocity all create 300.0 4928459 dist gaussian
+
+molecule mol1 rxn1_stp1_unreacted.data_template
+Read molecule template mol1:
+  1 molecules
+  18 atoms with max type 8
+  16 bonds with max type 14
+  25 angles with max type 28
+  23 dihedrals with max type 36
+  14 impropers with max type 11
+molecule mol2 rxn1_stp1_reacted.data_template
+Read molecule template mol2:
+  1 molecules
+  18 atoms with max type 9
+  17 bonds with max type 13
+  31 angles with max type 27
+  39 dihedrals with max type 33
+  20 impropers with max type 1
+molecule mol3 rxn1_stp2_unreacted.data_template
+Read molecule template mol3:
+  1 molecules
+  15 atoms with max type 9
+  14 bonds with max type 13
+  25 angles with max type 27
+  30 dihedrals with max type 33
+  16 impropers with max type 1
+molecule mol4 rxn1_stp2_reacted.data_template
+Read molecule template mol4:
+  1 molecules
+  15 atoms with max type 11
+  13 bonds with max type 15
+  19 angles with max type 29
+  16 dihedrals with max type 32
+  10 impropers with max type 13
+
+thermo 50
+
+# dump 1 all xyz 1 test_vis.xyz
+
+fix myrxns all bond/react stabilization yes statted_grp .03   react rxn1 all 1 0.0 5.0 mol1 mol2 rxn1_stp1_map prob v_prob1 1234   react rxn2 all 1 0.0 5.0 mol3 mol4 rxn1_stp2_map prob v_prob2 1234
+WARNING: Bond/react: Atom affected by reaction rxn1 too close to template edge (src/USER-REACTION/fix_bond_react.cpp:2051)
+WARNING: Bond/react: Atom affected by reaction rxn2 too close to template edge (src/USER-REACTION/fix_bond_react.cpp:2051)
+dynamic group bond_react_MASTER_group defined
+dynamic group statted_grp_REACT defined
+
+fix 1 statted_grp_REACT nvt temp 300 300 100
+
+# optionally, you can customize behavior of reacting atoms,
+# by using the internally-created 'bond_react_MASTER_group', like so:
+fix 4 bond_react_MASTER_group temp/rescale 1 300 300 10 1
+
+thermo_style custom step temp press density v_prob1 v_prob2 f_myrxns[1] f_myrxns[2]
+
+# restart 100 restart1 restart2
+
+run ${runsteps}
+run 1000
+PPPM initialization ...
+  using 12-bit tables for long-range coulomb (src/kspace.cpp:332)
+  G vector (1/distance) = 0.0534597
+  grid = 2 2 2
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0402256
+  estimated relative force accuracy = 0.000121138
+  using double precision FFTW3
+  3d grid and FFT values/proc = 252 2
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 10.5
+  ghost atom cutoff = 10.5
+  binsize = 5.25, bins = 10 10 10
+  2 neighbor lists, perpetual/occasional/extra = 1 1 0
+  (1) pair lj/class2/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) fix bond/react, occasional, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+WARNING: Inconsistent image flags (src/domain.cpp:812)
+Per MPI rank memory allocation (min/avg/max) = 33.66 | 33.88 | 34.43 Mbytes
+Step Temp Press Density v_prob1 v_prob2 f_myrxns[1] f_myrxns[2] 
+       0          300    346.78165 0.0034851739            0            0            0            0 
+      50     266.5092   -90.813802 0.0034851739          0.1            0            1            0 
+     100    559.41271    -53.23688 0.0034851739          0.2            0            1            0 
+     150    489.90516    31.555817 0.0034851739          0.3            0            1            0 
+     200    326.18391    7.7889992 0.0034851739          0.4            0            1            0 
+     250    339.78203    2.3919541 0.0034851739          0.5            0            1            0 
+     300    370.90263    -32.01673 0.0034851739          0.6            0            1            0 
+     350    294.07547   -5.4019813 0.0034851739          0.7            0            1            0 
+     400    287.76477    12.254133 0.0034851739          0.8            0            1            0 
+     450    293.36482    66.372956 0.0034851739          0.9            0            1            0 
+     500    246.84496    26.132317 0.0034851739            1            0            1            0 
+     550    253.08778   -15.350262 0.0034851739          1.1            1            1            1 
+     600    358.83641    25.007371 0.0034851739          1.2            1            1            1 
+     650    320.51492    -32.34823 0.0034851739          1.3            1            1            1 
+     700    310.87976   -8.2306669 0.0034851739          1.4            1            1            1 
+     750    307.54142    12.025818 0.0034851739          1.5            1            1            1 
+     800    272.51724    -22.92823 0.0034851739          1.6            1            1            1 
+     850    268.66181    10.069534 0.0034851739          1.7            1            1            1 
+     900     265.5531   -10.471377 0.0034851739          1.8            1            1            1 
+     950    259.43086    9.4546712 0.0034851739          1.9            1            1            1 
+    1000    247.14622    20.250308 0.0034851739            2            1            1            1 
+Loop time of 0.357762 on 4 procs for 1000 steps with 44 atoms
+
+Performance: 241.502 ns/day, 0.099 hours/ns, 2795.157 timesteps/s
+99.0% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.0003917  | 0.015545   | 0.033317   |  11.9 |  4.35
+Bond    | 0.0010131  | 0.030153   | 0.076975   |  18.2 |  8.43
+Kspace  | 0.092857   | 0.1462     | 0.18688    |  10.7 | 40.87
+Neigh   | 0.043786   | 0.044014   | 0.044189   |   0.1 | 12.30
+Comm    | 0.03636    | 0.038345   | 0.040538   |   0.8 | 10.72
+Output  | 0.00091578 | 0.0012541  | 0.0020923  |   1.4 |  0.35
+Modify  | 0.075379   | 0.080791   | 0.086052   |   1.8 | 22.58
+Other   |            | 0.00146    |            |       |  0.41
+
+Nlocal:    11 ave 32 max 0 min
+Histogram: 2 0 1 0 0 0 0 0 0 1
+Nghost:    40 ave 51 max 19 min
+Histogram: 1 0 0 0 0 0 0 1 0 2
+Neighs:    191 ave 529 max 0 min
+Histogram: 2 0 0 0 1 0 0 0 0 1
+
+Total # of neighbors = 764
+Ave neighs/atom = 17.3636
+Ave special neighs/atom = 9.77273
+Neighbor list builds = 1000
+Dangerous builds = 0
+
+# write_restart restart_longrun
+# write_data restart_longrun.data
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:00:00
--- a/lib/gpu/lal_atom.cu
+++ b/lib/gpu/lal_atom.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
--- a/lib/gpu/lal_beck.cu
+++ b/lib/gpu/lal_beck.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_born.cu
+++ b/lib/gpu/lal_born.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@ -11,7 +11,7 @@
 //
 //    begin                : June 2018
 //    email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

@ -109,7 +109,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
      numtyp rsq = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
-      if (rsq<cutsq_sigma[mtype].x) { // cutsq 
+      if (rsq<cutsq_sigma[mtype].x) { // cutsq
        numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;

        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
@ -250,7 +250,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

-      if (rsq<cutsq_sigma[mtype].x) { // cutsq 
+      if (rsq<cutsq_sigma[mtype].x) { // cutsq
        numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;

        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_born_coul_wolf_cs.cu
+++ b/lib/gpu/lal_born_coul_wolf_cs.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_buck.cu
+++ b/lib/gpu/lal_buck.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_buck_coul.cu
+++ b/lib/gpu/lal_buck_coul.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_buck_coul_long.cu
+++ b/lib/gpu/lal_buck_coul_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_charmm_long.cu
+++ b/lib/gpu/lal_charmm_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_colloid.cu
+++ b/lib/gpu/lal_colloid.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_coul.cu
+++ b/lib/gpu/lal_coul.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ndtrung@umich.edu
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_coul_debye.cu
+++ b/lib/gpu/lal_coul_debye.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ndtrung@umich.edu
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_coul_dsf.cu
+++ b/lib/gpu/lal_coul_dsf.cu
@ -11,7 +11,7 @@
 //
 //    begin                : 8/15/2012
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_coul_long.cu
+++ b/lib/gpu/lal_coul_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                : July 2011
 //    email                : a.kohlmeyer@temple.edu
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@ -11,7 +11,7 @@
 //
 //    begin                : June 2018
 //    email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_dipole_lj_sf.cu
+++ b/lib/gpu/lal_dipole_lj_sf.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_dipole_long_lj.cu
+++ b/lib/gpu/lal_dipole_long_lj.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_dpd.cu
+++ b/lib/gpu/lal_dpd.cu
@ -11,7 +11,7 @@
 //
 //    begin                : Jan 15, 2014
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_eam.cu
+++ b/lib/gpu/lal_eam.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_ellipsoid_nbor.cu
+++ b/lib/gpu/lal_ellipsoid_nbor.cu
@ -9,9 +9,8 @@
 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 // __________________________________________________________________________
 //
-//    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
@ -53,8 +52,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
    int itype=fast_mul(iw,ntypes);
    int newj=0;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
-      int j=dev_ij[nbor];
-      j &= NEIGHMASK;
+      int sj=dev_ij[nbor];
+      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      int mtype=itype+jtype;
@ -69,7 +68,7 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_,
        rsq+=t*t;

        if (rsq<cf.x) {
-          dev_nbor[packed]=j;
+          dev_nbor[packed]=sj;
          packed+=nbor_pitch;
          newj++;
        }
@ -117,8 +116,8 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,

    int newj=0;
    for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
-      int j=dev_ij[nbor];
-      j &= NEIGHMASK;
+      int sj=dev_ij[nbor];
+      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      int jtype=jx.w;
      int mtype=itype+jtype;
@ -133,7 +132,7 @@ __kernel void kernel_nbor_fast(const __global numtyp4 *restrict x_,
        rsq+=t*t;

        if (rsq<cutsq[mtype]) {
-          dev_nbor[packed]=j;
+          dev_nbor[packed]=sj;
          packed+=nbor_pitch;
          newj++;
        }
--- a/lib/gpu/lal_gauss.cu
+++ b/lib/gpu/lal_gauss.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_gayberne.cu
+++ b/lib/gpu/lal_gayberne.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
--- a/lib/gpu/lal_gayberne_lj.cu
+++ b/lib/gpu/lal_gayberne_lj.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
--- a/lib/gpu/lal_lj.cu
+++ b/lib/gpu/lal_lj.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_lj96.cu
+++ b/lib/gpu/lal_lj96.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_lj_class2_long.cu
+++ b/lib/gpu/lal_lj_class2_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                : Mon May 16 2011
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_coul.cu
+++ b/lib/gpu/lal_lj_coul.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_coul_debye.cu
+++ b/lib/gpu/lal_lj_coul_debye.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_coul_long.cu
+++ b/lib/gpu/lal_lj_coul_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_coul_msm.cu
+++ b/lib/gpu/lal_lj_coul_msm.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_lj_dsf.cu
+++ b/lib/gpu/lal_lj_dsf.cu
@ -11,7 +11,7 @@
 //
 //    begin                : 7/12/2012
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_expand.cu
+++ b/lib/gpu/lal_lj_expand.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ibains@nvidia.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_expand_coul_long.cu
+++ b/lib/gpu/lal_lj_expand_coul_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_gromacs.cu
+++ b/lib/gpu/lal_lj_gromacs.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_lj_sdk.cu
+++ b/lib/gpu/lal_lj_sdk.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_lj_sdk_long.cu
+++ b/lib/gpu/lal_lj_sdk_long.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -12,7 +12,7 @@
 //
 //    begin                :
 //    email                : penwang@nvidia.com, brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_preprocessor.h"
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_re_squared.cu
+++ b/lib/gpu/lal_re_squared.cu
@ -11,7 +11,7 @@
 //
 //    begin                : Fri May 06 2011
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
--- a/lib/gpu/lal_re_squared_lj.cu
+++ b/lib/gpu/lal_re_squared_lj.cu
@ -11,7 +11,7 @@
 //
 //    begin                : Fri May 06 2011
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_ellipsoid_extra.h"
--- a/lib/gpu/lal_soft.cu
+++ b/lib/gpu/lal_soft.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_sw.cu
+++ b/lib/gpu/lal_sw.cu
@ -11,7 +11,7 @@
 //
 //    begin                : Tue March 26, 2013
 //    email                : brownw@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_table.cu
+++ b/lib/gpu/lal_table.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -11,7 +11,7 @@
 //
 //       begin                : Thu April 17, 2014
 //       email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_tersoff_extra.h"
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -11,7 +11,7 @@
 //
 //       begin                :
 //       email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_tersoff_mod_extra.h"
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -11,7 +11,7 @@
 //
 //       begin                :
 //       email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_tersoff_zbl_extra.h"
--- a/lib/gpu/lal_ufm.cu
+++ b/lib/gpu/lal_ufm.cu
@ -10,7 +10,7 @@
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

-    begin                : 
+    begin                :
    email                : pl.rodolfo@gmail.com
                           dekoning@ifi.unicamp.br
 ***************************************************************************/
@ -31,10 +31,10 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
                   const __global numtyp4 *restrict uf3,
                   const int lj_types,
                   const __global numtyp *restrict sp_lj,
-                   const __global int * dev_nbor, 
-                   const __global int * dev_packed, 
-                   __global acctyp4 *restrict ans, 
-                   __global acctyp *restrict engv, 
+                   const __global int * dev_nbor,
+                   const __global int * dev_packed,
+                   __global acctyp4 *restrict ans,
+                   __global acctyp *restrict engv,
                   const int eflag, const int vflag, const int inum,
                   const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
@ -46,19 +46,19 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;
-  
+
  if (ii<inum) {
    int i, numj, nbor, nbor_end;
    __local int n_stride;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);
-  
+
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
-  
+
      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
@ -71,12 +71,12 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;
- 
+
      int mtype=itype*lj_types+jtype;
      if (rsq<uf1[mtype].z) {
        numtyp expuf = exp(- rsq * uf1[mtype].y);
        numtyp force = factor_lj * uf1[mtype].x * expuf / (1.0 - expuf);
-      
+
        f.x += delx*force;
        f.y += dely*force;
        f.z += delz*force;
@ -102,17 +102,17 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_,

 __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
                        const __global numtyp4 *restrict uf1_in,
-                        const __global numtyp4 *restrict uf3_in, 
+                        const __global numtyp4 *restrict uf3_in,
                        const __global numtyp *restrict sp_lj_in,
-                        const __global int * dev_nbor, 
-                        const __global int * dev_packed, 
-                        __global acctyp4 *restrict ans, 
-                        __global acctyp *restrict engv, 
-                        const int eflag, const int vflag, const int inum, 
+                        const __global int * dev_nbor,
+                        const __global int * dev_packed,
+                        __global acctyp4 *restrict ans,
+                        __global acctyp *restrict engv,
+                        const int eflag, const int vflag, const int inum,
                        const int nbor_pitch, const int t_per_atom) {
  int tid, ii, offset;
  atom_info(t_per_atom,ii,tid,offset);
-  
+
  __local numtyp4 uf1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 uf3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
@ -123,7 +123,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
    if (eflag>0)
      uf3[tid]=uf3_in[tid];
  }
-  
+
  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
@ -132,7 +132,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
    virial[i]=(acctyp)0;

  __syncthreads();
-  
+
  if (ii<inum) {
    int i, numj, nbor, nbor_end;
    __local int n_stride;
@ -145,7 +145,7 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,

    numtyp factor_lj;
    for ( ; nbor<nbor_end; nbor+=n_stride) {
-  
+
      int j=dev_packed[nbor];
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;
@ -158,11 +158,11 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_,
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;
-        
+
      if (rsq<uf1[mtype].z) {
        numtyp expuf = exp(- rsq * uf1[mtype].y);
        numtyp force = factor_lj * uf1[mtype].x * expuf / (1.0 - expuf);
-      
+
        f.x += delx*force;
        f.y += dely*force;
        f.z += delz*force;
--- a/lib/gpu/lal_vashishta.cu
+++ b/lib/gpu/lal_vashishta.cu
@ -11,7 +11,7 @@
 //
 //    begin                : Mon June 12, 2017
 //    email                : andershaf@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
@ -290,7 +290,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_,

        if (eflag>0)
          energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0);
-          
+
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
@ -471,13 +471,13 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_,
      numtyp rsq1 = delr1x*delr1x+delr1y*delr1y+delr1z*delr1z;

      int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype];
-      
+
      numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex);
      param_r0sq_ij=param4_ijparam.x;
      if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1
      param_gamma_ij=param4_ijparam.y;
      param_r0_ij=param4_ijparam.w;
-      
+
      int nbor_k,k_end;
      if (dev_packed==dev_nbor) {
        nbor_k=nborj_start-offset_j+offset_k;
@ -619,7 +619,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,

      param_gamma_ij=param4_ijparam.y;
      param_r0_ij = param4_ijparam.w;
-      
+
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
@ -665,14 +665,14 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_,
        if (rsq2 < param_r0sq_ik) {
          param_gamma_ik=param4_ikparam.y;
          param_r0_ik=param4_ikparam.w;
-          
+
          int ijkparam=elem2param[jtype*nelements*nelements+itype*nelements+ktype]; //jik
          numtyp4 param5_ijkparam; fetch4(param5_ijkparam,ijkparam,param5_tex);
          param_bigc_ijk=param5_ijkparam.x;
          param_costheta_ijk=param5_ijkparam.y;
          param_bigb_ijk=param5_ijkparam.z;
          param_big2b_ijk=param5_ijkparam.w;
-          
+
          numtyp fjx, fjy, fjz;
          //if (evatom==0) {
            threebody_half(delr1x,delr1y,delr1z);
@ -774,7 +774,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,

      param_gamma_ij=param4_ijparam.y;
      param_r0_ij=param4_ijparam.w;
-      
+
      int nbor_k,numk;
      if (dev_nbor==dev_packed) {
        if (gpu_nbor) nbor_k=j+nbor_pitch;
@ -827,7 +827,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_,
          param_costheta_ijk=param5_ijkparam.y;
          param_bigb_ijk=param5_ijkparam.z;
          param_big2b_ijk=param5_ijkparam.w;
-          
+
          numtyp fjx, fjy, fjz, fkx, fky, fkz;
          threebody(delr1x,delr1y,delr1z,eflag,energy);

--- a/lib/gpu/lal_yukawa.cu
+++ b/lib/gpu/lal_yukawa.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/gpu/lal_yukawa_colloid.cu
+++ b/lib/gpu/lal_yukawa_colloid.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : nguyentd@ornl.gov
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)

--- a/lib/gpu/lal_zbl.cu
+++ b/lib/gpu/lal_zbl.cu
@ -11,7 +11,7 @@
 //
 //    begin                :
 //    email                : ndactrung@gmail.com
-// ***************************************************************************/
+// ***************************************************************************

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include "lal_aux_fun1.h"
--- a/lib/kokkos/BUILD.md
+++ b/lib/kokkos/BUILD.md
@ -40,6 +40,13 @@ cmake ${srcdir} \
 ````
 which activates the OpenMP backend. All of the options controlling device backends, options, architectures, and third-party libraries (TPLs) are given below.

+## Platform-specific Problems
+
+### Cray
+
+* The Cray compiler wrappers do static linking by default. This seems to break the Kokkos build. You will likely need to set the environment variable `CRAYPE_LINK_TYPE=dynamic` in order to link correctly. Kokkos warns during configure if this is missing.
+* The Cray compiler identifies to CMake as Clang, but it sometimes has its own flags that differ from Clang. We try to include all exceptions, but flag errors may occur in which a Clang-specific flag is passed that the Cray compiler does not recognize.
+
 ## Spack
 An alternative to manually building with the CMake is to use the Spack package manager.
 To do so, download the `kokkos-spack` git repo and add to the package list:
@ -63,6 +70,7 @@ For a complete list of Kokkos options, run:
 ````
 spack info kokkos
 ````
+More details can be found in the kokkos-spack repository [README](https://github.com/kokkos/kokkos-spack/blob/master/README.md).

 #### Spack Development
 Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
@ -73,32 +81,8 @@ spack find -p kokkos ...
 ````
 where `...` is the unique spec identifying the particular Kokkos configuration and version.

-A better way to use Spack for doing Kokkos development is the DIY feature of Spack.
-If you wish to develop Kokkos itself, go to the Kokkos source folder:
-````
-spack diy -u cmake kokkos@diy ...
-````
-where `...` is a Spack spec identifying the exact Kokkos configuration.
-This then creates a `spack-build` directory where you can run `make`.
-
-If doing development on a downstream project, you can do almost exactly the same thing.
-````
-spack diy -u cmake ${myproject}@${myversion} ... ^kokkos...
-````
-where the `...` are the specs for your project and the desired Kokkos configuration.
-Again, a `spack-build` directory will be created where you can run `make`.
-
-Spack has a few idiosyncracies that make building outside of Spack annoying related to Spack forcing use of a compiler wrapper. This can be worked around by having a `-DSpack_WORKAROUND=On` given your CMake. Then add the block of code to your CMakeLists.txt:
-
-````
-if (Spack_WORKAROUND)
- set(SPACK_CXX $ENV{SPACK_CXX})
- if(SPACK_CXX)
-   set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE)
-   set(ENV{CXX} ${SPACK_CXX})
- endif()
-endif()
-````
+A better way to use Spack for doing Kokkos development is the dev-build feature of Spack.
+For dev-build details, consult the kokkos-spack repository [README](https://github.com/kokkos/kokkos-spack/blob/master/README.md).

 # Kokkos Keyword Listing

@ -157,6 +141,9 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_DEPRECATED_CODE
    * Whether to enable deprecated code
    * BOOL Default: OFF
+* Kokkos_ENABLE_EXAMPLES
+    * Whether to enable building examples
+    * BOOL Default: OFF
 * Kokkos_ENABLE_HPX_ASYNC_DISPATCH
    * Whether HPX supports asynchronous dispatch
    * BOOL Default: OFF
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,5 +1,59 @@
 # Change Log

+## [3.1.00](https://github.com/kokkos/kokkos/tree/3.1.00) (2020-04-14)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.0.00...3.1.00)
+
+**Features:**
+
+- HIP Support for AMD
+- OpenMPTarget Support with clang
+- Windows VS19 (Serial) Support [\#1533](https://github.com/kokkos/kokkos/issues/1533)
+
+**Implemented enhancements:**
+
+- generate\_makefile.bash should allow tests to be disabled [\#2886](https://github.com/kokkos/kokkos/issues/2886)
+- clang/7+cuda/9 build -Werror-unused parameter error in nightly test [\#2884](https://github.com/kokkos/kokkos/issues/2884)
+- ScatterView memory space is not user settable [\#2826](https://github.com/kokkos/kokkos/issues/2826)
+- clang/8+cuda/10.0 build error with c++17 [\#2809](https://github.com/kokkos/kokkos/issues/2809)
+- warnings.... [\#2805](https://github.com/kokkos/kokkos/issues/2805)
+- Kokkos version in cpp define [\#2787](https://github.com/kokkos/kokkos/issues/2787)
+- Remove Defunct QThreads Backend [\#2751](https://github.com/kokkos/kokkos/issues/2751)
+- Improve Kokkos::fence behavior with multiple execution spaces [\#2659](https://github.com/kokkos/kokkos/issues/2659)
+- polylithic\(?\) initialization of Kokkos [\#2658](https://github.com/kokkos/kokkos/issues/2658)
+- Unnecessary\(?\) check for host execution space initialization from Cuda initialization [\#2652](https://github.com/kokkos/kokkos/issues/2652)
+- Kokkos error reporting failures with CUDA GPUs in exclusive mode [\#2471](https://github.com/kokkos/kokkos/issues/2471)
+- atomicMax equivalent \(and other atomics\) [\#2401](https://github.com/kokkos/kokkos/issues/2401)
+- Fix alignment for Kokkos::complex [\#2255](https://github.com/kokkos/kokkos/issues/2255)
+- Warnings with Cuda 10.1 [\#2206](https://github.com/kokkos/kokkos/issues/2206)
+- dual view with Kokkos::ViewAllocateWithoutInitializing [\#2188](https://github.com/kokkos/kokkos/issues/2188)
+- Check error code  from cudaOccupancyMaxActiveBlocksPerMultiprocessor [\#2172](https://github.com/kokkos/kokkos/issues/2172)
+- Add non-member Kokkos::resize/realloc for DualView [\#2170](https://github.com/kokkos/kokkos/issues/2170)
+- Construct DualView without initialization [\#2046](https://github.com/kokkos/kokkos/issues/2046)
+- Expose is\_assignable to determine if one view can be assigned to another [\#1936](https://github.com/kokkos/kokkos/issues/1936)
+- profiling label [\#1935](https://github.com/kokkos/kokkos/issues/1935)
+- team\_broadcast of bool failed on CUDA backend [\#1908](https://github.com/kokkos/kokkos/issues/1908)
+- View static\_extent [\#660](https://github.com/kokkos/kokkos/issues/660)
+- Misleading Kokkos::Cuda::initialize ERROR message when compiled for wrong GPU architecture [\#1944](https://github.com/kokkos/kokkos/issues/1944)
+- Cryptic Error When Malloc Fails [\#2164](https://github.com/kokkos/kokkos/issues/2164)
+- Drop support for intermediate standards in CMake [\#2336](https://github.com/kokkos/kokkos/issues/2336)
+
+**Fixed bugs:**
+
+- DualView sync\_device with length zero creates cuda errors [\#2946](https://github.com/kokkos/kokkos/issues/2946)
+- building with nvcc and clang \(or clang based XL\) as host compiler: "Kokkos::atomic\_fetch\_min\(volatile int \*, int\)" has already been defined [\#2903](https://github.com/kokkos/kokkos/issues/2903)
+- Cuda 9.1,10.1 debug builds failing due to -Werror=unused-parameter [\#2880](https://github.com/kokkos/kokkos/issues/2880)
+- clang -Werror: Kokkos\_FixedBufferMemoryPool.hpp:140:28: error: unused parameter 'alloc\_size' [\#2869](https://github.com/kokkos/kokkos/issues/2869)
+- intel/16.0.1, intel/17.0.1 nightly build failures with debugging enabled [\#2867](https://github.com/kokkos/kokkos/issues/2867)
+- intel/16.0.1 debug build errors [\#2863](https://github.com/kokkos/kokkos/issues/2863)
+- xl/16.1.1 with cpp14, openmp build, nightly test failures [\#2856](https://github.com/kokkos/kokkos/issues/2856)
+- Intel nightly test failures: team\_vector [\#2852](https://github.com/kokkos/kokkos/issues/2852)
+- Kokkos Views with intmax/2\<N\<intmax can hang during construction [\#2850](https://github.com/kokkos/kokkos/issues/2850)
+- workgraph\_fib test seg-faults with threads backend and hwloc [\#2797](https://github.com/kokkos/kokkos/issues/2797)
+- cuda.view\_64bit test hangs on Power8+Kepler37 system - develop and 2.9.00 branches [\#2771](https://github.com/kokkos/kokkos/issues/2771)
+- device\_type for Kokkos\_Random ?  [\#2693](https://github.com/kokkos/kokkos/issues/2693)
+- "More than one tag given" error in Experimental::require\(\) [\#2608](https://github.com/kokkos/kokkos/issues/2608)
+- Segfault on Marvell from our finalization stack [\#2542](https://github.com/kokkos/kokkos/issues/2542)
+
 ## [3.0.00](https://github.com/kokkos/kokkos/tree/3.0.00) (2020-01-27)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.9.00...3.0.00)

@ -214,7 +268,7 @@
 ## [2.6.00](https://github.com/kokkos/kokkos/tree/2.6.00) (2018-03-07)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.5.00...2.6.00)

-**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.6**      
+**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.6**

 **Implemented enhancements:**

@ -258,7 +312,7 @@
 ## [2.5.00](https://github.com/kokkos/kokkos/tree/2.5.00) (2017-12-15)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.11...2.5.00)

-**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.5**      
+**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.5**

 **Implemented enhancements:**

--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@ -2,7 +2,7 @@
 # We want to determine if options are given with the wrong case
 # In order to detect which arguments are given to compare against
 # the list of valid arguments, at the beginning here we need to
-# form a list of all the given variables. If it begins with any 
+# form a list of all the given variables. If it begins with any
 # case of KoKkOS, we add it to the list.


@ -25,6 +25,8 @@ SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})

 # Needed to simplify syntax of if statements
 CMAKE_POLICY(SET CMP0054 NEW)
+# Needed to make IN_LIST a valid operator
+CMAKE_POLICY(SET CMP0057 NEW)

 # Is this a build as part of Trilinos?
 IF(COMMAND TRIBITS_PACKAGE_DECL)
@ -65,7 +67,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
  cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
  set(CMAKE_DISABLE_SOURCE_CHANGES ON)
  set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)
-  IF (Spack_WORKAROUND) 
+  IF (Spack_WORKAROUND)
    #if we are explicitly using Spack for development,
    #nuke the Spack compiler
    SET(SPACK_CXX $ENV{SPACK_CXX})
@ -75,7 +77,15 @@ IF(NOT KOKKOS_HAS_TRILINOS)
    ENDIF()
  ENDif()
  IF(NOT DEFINED ${PROJECT_NAME})
+    # WORKAROUND FOR HIPCC
+    # When HIP is enabled: save the current CMAKE_CXX_FLAGS, append a fixed
+    # --amdgpu-target so it is in effect while PROJECT() enables the CXX language,
+    # then restore the saved flags afterwards so the hard-coded gfx906 target
+    # is not left in CMAKE_CXX_FLAGS for the rest of the configuration.
+    # NOTE(review): presumably required for hipcc to pass CMake's compiler checks - confirm.
+    IF(Kokkos_ENABLE_HIP)
+      SET(KOKKOS_INTERNAL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+      SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --amdgpu-target=gfx906")
+    ENDIF()
    PROJECT(Kokkos CXX)
+    # Undo the temporary flag change made before PROJECT().
+    IF(Kokkos_ENABLE_HIP)
+      SET(CMAKE_CXX_FLAGS ${KOKKOS_INTERNAL_CMAKE_CXX_FLAGS})
+    ENDIF()
  ENDIF()
 ENDIF()

@ -92,16 +102,17 @@ ENDIF()


 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 0)
+set(Kokkos_VERSION_MINOR 1)
 set(Kokkos_VERSION_PATCH 0)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
+math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")

-IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") 
+IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0")
  MESSAGE(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
  CMAKE_POLICY(SET CMP0074 NEW)
 ENDIF()

-# Load either the real TriBITS or a TriBITS wrapper 
+# Load either the real TriBITS or a TriBITS wrapper
 # for certain utility functions that are universal (like GLOBAL_SET)
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake)

@ -118,18 +129,14 @@ ENDIF()
 # These are the variables we will append to as we go
 # I really wish these were regular variables
 # but scoping issues can make it difficult
-GLOBAL_RESET(KOKKOS_COMPILE_OPTIONS)
-GLOBAL_RESET(KOKKOS_LINK_OPTIONS)
-GLOBAL_RESET(KOKKOS_CUDA_OPTIONS)
-GLOBAL_RESET(KOKKOS_CUDAFE_OPTIONS)
-GLOBAL_RESET(KOKKOS_XCOMPILER_OPTIONS)
+GLOBAL_SET(KOKKOS_COMPILE_OPTIONS)
+GLOBAL_SET(KOKKOS_LINK_OPTIONS)
+GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
+GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS)
+GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS)
 # We need to append text here for making sure TPLs
 # we import are available for an installed Kokkos
-GLOBAL_RESET(KOKKOS_TPL_EXPORTS)
-# We need these for controlling the exact -std flag
-GLOBAL_RESET(KOKKOS_DONT_ALLOW_EXTENSIONS)
-GLOBAL_RESET(KOKKOS_USE_CXX_EXTENSIONS)
-GLOBAL_RESET(KOKKOS_CXX_STANDARD_FEATURE)
+GLOBAL_SET(KOKKOS_TPL_EXPORTS)

 # Include a set of Kokkos-specific wrapper functions that
 # will either call raw CMake or TriBITS
@ -137,6 +144,9 @@ GLOBAL_RESET(KOKKOS_CXX_STANDARD_FEATURE)
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake)


+# Check the environment and set certain variables
+# to allow platform-specific checks
+INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake)
 # The build environment setup goes in the following steps
 # 1) Check all the enable options. This includes checking Kokkos_DEVICES
 # 2) Check the compiler ID (type and version)
@ -187,14 +197,21 @@ IF (KOKKOS_HAS_TRILINOS)
  # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags
  # we have to match the annoying behavior
  STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}")
-  STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS    "${KOKKOS_CUDA_OPTIONS}")
-  FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
-    SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
-  ENDFOREACH()
+  LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
+  LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS})
  FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS})
    SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
+    LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
  ENDFOREACH()
-  SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
+  SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
+  IF (KOKKOS_ENABLE_CUDA)
+    STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS    "${KOKKOS_CUDA_OPTIONS}")
+    FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
+      SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
+      LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG})
+    ENDFOREACH()
+    SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}")
+  ENDIF()
  # Both parent scope and this package
  # In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in
  # TRILINOS_TOPLEVEL_CXX_FLAGS
@ -203,6 +220,8 @@ IF (KOKKOS_HAS_TRILINOS)
  #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here
  #These flags get set up in KOKKOS_PACKAGE_DECL, which means they
  #must be configured before KOKKOS_PACKAGE_DECL
+  SET(KOKKOS_ALL_COMPILE_OPTIONS
+    $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_ALL_COMPILE_OPTIONS}>)
 ENDIF()

 KOKKOS_PACKAGE_DECL()
@ -250,7 +269,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" DESTINATION ${CM
 IF (HAS_PARENT)
  FOREACH(DEV Kokkos_ENABLED_DEVICES)
    #I would much rather not make these cache variables or global properties, but I can't
-    #make any guarantees on whether PARENT_SCOPE is good enough to make 
+    #make any guarantees on whether PARENT_SCOPE is good enough to make
    #these variables visible where I need them
    SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE)
    SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON)
--- a/lib/kokkos/Copyright.txt
+++ b/lib/kokkos/Copyright.txt
@ -1,13 +1,13 @@
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 3.0
 //       Copyright (2020) National Technology & Engineering
 //               Solutions of Sandia, LLC (NTESS).
-// 
+//
 // Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -36,6 +36,6 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -6,15 +6,20 @@ ifndef KOKKOS_PATH
 endif
 CXXFLAGS=$(CCFLAGS)

-# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
+# Kokkos release version. KOKKOS_VERSION packs the three components into a
+# single integer (MAJOR*10000 + MINOR*100 + PATCH); it is written to the
+# generated config header as "#define KOKKOS_VERSION".
+KOKKOS_VERSION_MAJOR = 3
+KOKKOS_VERSION_MINOR = 1
+KOKKOS_VERSION_PATCH = 0
+# Computed with POSIX expr (already used elsewhere in this Makefile) rather than bc:
+# bc is missing from many minimal installations, which would leave the define empty.
+# Simple expansion (:=) runs the shell command once instead of on every reference.
+KOKKOS_VERSION := $(shell expr $(KOKKOS_VERSION_MAJOR) \* 10000 + $(KOKKOS_VERSION_MINOR) \* 100 + $(KOKKOS_VERSION_PATCH))
+
+# Options: Cuda,HIP,OpenMP,Pthread,HPX,OpenMPTarget,Serial
 KOKKOS_DEVICES ?= "OpenMP"
-#KOKKOS_DEVICES ?= "Pthreads"
+#KOKKOS_DEVICES ?= "Pthread"
 # Options: 
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKX
 # NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2
 # IBM:      BGQ,Power7,Power8,Power9
-# AMD-GPUS: Kaveri,Carrizo,Fiji,Vega
+# AMD-GPUS: Vega900,Vega906
 # AMD-CPUS: AMDAVX,Ryzen,EPYC
 KOKKOS_ARCH ?= ""
 # Options: yes,no
@ -35,6 +40,9 @@ KOKKOS_STANDALONE_CMAKE ?= "no"
 # Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr
 KOKKOS_CUDA_OPTIONS ?= "enable_lambda"

+# Options: rdc
+KOKKOS_HIP_OPTIONS ?= ""
+
 # Default settings specific options.
 # Options: enable_async_dispatch
 KOKKOS_HPX_OPTIONS ?= ""
@ -82,29 +90,50 @@ KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPT
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
 KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)

+KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)

 # Check for Kokkos Host Execution Spaces one of which must be on.
 KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP)
 KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread)
-KOKKOS_INTERNAL_USE_QTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Qthreads)
 KOKKOS_INTERNAL_USE_HPX := $(call kokkos_has_string,$(KOKKOS_DEVICES),HPX)
 KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial)

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
  ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
-    ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
-      ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
-        KOKKOS_INTERNAL_USE_SERIAL := 1
-      endif
+    ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
+      KOKKOS_INTERNAL_USE_SERIAL := 1
    endif
  endif
 endif

 # Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
-KOKKOS_INTERNAL_USE_ROCM := $(call kokkos_has_string,$(KOKKOS_DEVICES),ROCm)
+KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP)
 KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)

+# Build the space-separated list of enabled backends. Each name is appended
+# only when the matching KOKKOS_INTERNAL_USE_* flag computed above equals 1.
+# Note: the Pthread backend is recorded under the name "Threads".
+KOKKOS_DEVICELIST =
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  KOKKOS_DEVICELIST += Serial
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  KOKKOS_DEVICELIST += OpenMP
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  KOKKOS_DEVICELIST += Threads
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+  KOKKOS_DEVICELIST += HPX
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  KOKKOS_DEVICELIST += Cuda
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+  KOKKOS_DEVICELIST += HIP
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  KOKKOS_DEVICELIST += OPENMPTARGET
+endif
+
+
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
  ifeq ($(origin CUDA_PATH), undefined)
@ -132,6 +161,7 @@ KOKKOS_INTERNAL_COMPILER_NVCC        := $(strip $(shell echo "$(shell export OMP
 KOKKOS_INTERNAL_COMPILER_CLANG       := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM)
 KOKKOS_INTERNAL_COMPILER_HCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
+KOKKOS_INTERNAL_COMPILER_GCC         := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)

 # Check Host Compiler if using NVCC through nvcc_wrapper
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@ -180,20 +210,20 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
    KOKKOS_INTERNAL_COMPILER_WARNINGS =
  else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+      KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
    else
      ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
-        KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+        KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
      else
        ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-          KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+          KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
        else
          ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
            # TODO check if cray accepts GNU style warnings
            KOKKOS_INTERNAL_COMPILER_WARNINGS =
          else
            #gcc
-            KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
+            KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
          endif
        endif
      endif
@ -230,7 +260,12 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
  KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_IBM_XL_OMP45_WORKAROUND -qsmp=omp -qoffload -qnoeh
 else
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
+    #KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp
+    KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget
+  else
+    #Assume GCC
+    KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
  endif
 endif

@ -353,11 +388,8 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
 KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Ryzen)
 KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),EPYC)
-KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(call kokkos_has_string,$(KOKKOS_ARCH),Kaveri)
-KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(call kokkos_has_string,$(KOKKOS_ARCH),Carrizo)
-KOKKOS_INTERNAL_USE_ARCH_FIJI := $(call kokkos_has_string,$(KOKKOS_ARCH),Fiji)
-KOKKOS_INTERNAL_USE_ARCH_VEGA := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega)
-KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(call kokkos_has_string,$(KOKKOS_ARCH),gfx901)
+KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
+KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)

 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
@ -430,6 +462,10 @@ tmp := $(call kokkos_append_header,'\#error "Do not include $(KOKKOS_CONFIG_HEAD
 tmp := $(call kokkos_append_header,'\#else')
 tmp := $(call kokkos_append_header,'\#define KOKKOS_CORE_CONFIG_H')
 tmp := $(call kokkos_append_header,'\#endif')
+
+tmp := $(call kokkos_append_header,"")
+tmp := $(call kokkos_append_header,"\#define KOKKOS_VERSION $(KOKKOS_VERSION)")
+tmp := $(call kokkos_append_header,"")
 	
 tmp := $(call kokkos_append_header,"/* Execution Spaces */")

@ -442,9 +478,15 @@ ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
  tmp := $(call kokkos_append_header,'\#define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_HIP')
+endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMPTARGET')
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
+  endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -455,10 +497,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_THREADS")
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_QTHREADS")
-endif
-
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX")
 endif
@ -966,6 +1004,14 @@ endif

 # Figure out the architecture flag for Cuda.
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+   KOKKOS_INTERNAL_USE_CUDA_ARCH=1
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+   ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+     KOKKOS_INTERNAL_USE_CUDA_ARCH=1
+   endif
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-arch
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
@ -974,7 +1020,17 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  else
    $(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
  endif
+  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
+endif

+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march
+  endif
+  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER30")
@ -1042,55 +1098,49 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
    endif
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+      ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+        KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
+      endif
+    endif
+  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+    KOKKOS_CXXFLAGS += --expt-extended-lambda
  endif
 endif

+
 # Figure out the architecture flag for HIP.
-ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  # Lets start with adding architecture defines
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KAVERI), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 701")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KAVERI")
-    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx701 
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_HIP 900")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA900")
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx900
  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_CARRIZO), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 801")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_CARRIZO")
-    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx801 
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_HIP 906")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA906")
+    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906
  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_FIJI), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 803")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_FIJI")
-    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx803
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 900")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA")
-    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx900 
-  endif
-  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_GFX901), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 901")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_GFX901")
-    KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx901 
-  endif
- 
-  
-  KOKKOS_INTERNAL_HCC_PATH := $(shell which $(CXX))
-  ROCM_HCC_PATH ?= $(KOKKOS_INTERNAL_HCC_PATH:/bin/clang++=)

-  KOKKOS_CXXFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --cxxflags) 
-  KOKKOS_LDFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --ldflags) -lhc_am -lm 
-  KOKKOS_CXXLDFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --ldflags) -lhc_am -lm
-  KOKKOS_TPL_LIBRARY_NAMES += hc_am m
-  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_ROCM_ARCH_FLAG)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp)

-  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.cpp)
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/ROCm/*.cpp)
-endif
-  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.hpp)
+  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
+  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
+
+  ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1)
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
+    KOKKOS_CXXFLAGS+=-fgpu-rdc
+    KOKKOS_LDFLAGS+=-fgpu-rdc
+  else
+    KOKKOS_CXXFLAGS+=-fno-gpu-rdc
+    KOKKOS_LDFLAGS+=-fno-gpu-rdc
+  endif
 endif

+
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)

 ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
@ -1141,7 +1191,7 @@ endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-  KOKKOS_SRC += $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.hpp)
  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
@ -1149,6 +1199,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  endif
  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
+  KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENMPTARGET_LIB)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -1178,22 +1229,6 @@ endif
  KOKKOS_TPL_LIBRARY_NAMES += pthread
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
-  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
-  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
-  ifneq ($(KOKKOS_CMAKE), yes)
-    ifneq ($(QTHREADS_PATH),)
-      KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
-      KOKKOS_LIBDIRS += -L$(QTHREADS_PATH)/lib
-      KOKKOS_CXXLDFLAGS += -L$(QTHREADS_PATH)/lib
-      KOKKOS_TPL_INCLUDE_DIRS += $(QTHREADS_PATH)/include
-      KOKKOS_TPL_LIBRARY_DIRS += $(QTHREADS_PATH)/lib64
-    endif
-    KOKKOS_LIBS += -lqthread
-    KOKKOS_TPL_LIBRARY_NAMES += qthread
-  endif
-endif
-
 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@ -55,6 +55,17 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 endif
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp
+Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
+Kokkos_HIP_KernelLaunch.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_KernelLaunch.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_KernelLaunch.cpp
+Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
 Kokkos_ROCm_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp
@ -79,13 +90,6 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
 endif
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
-Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
-Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
-endif
-
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
@ -106,10 +110,12 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
+Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
 Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
-#Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
-#       $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
 endif

 Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@ -11,8 +11,8 @@ CUDA, HPX, OpenMP and Pthreads as backend programming models with several other
 backends in development.

 Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,
-which also provides math kernels (https://github.com/kokkos/kokkos-kernels), as well as 
-profiling and debugging tools (https://github.com/kokkos/kokkos-tools).  
+which also provides math kernels (https://github.com/kokkos/kokkos-kernels), as well as
+profiling and debugging tools (https://github.com/kokkos/kokkos-tools).

 # Learning about Kokkos

@ -23,7 +23,7 @@ For questions find us on Slack: https://kokkosteam.slack.com or open a github is
 For non-public questions send an email to
 crtrott(at)sandia.gov

-A separate repository with extensive tutorial material can be found under 
+A separate repository with extensive tutorial material can be found under
 https://github.com/kokkos/kokkos-tutorials.

 Furthermore, the 'example/tutorial' directory provides step by step tutorial
@ -41,12 +41,12 @@ To learn more about Kokkos consider watching one of our presentations:

 # Contributing to Kokkos

-We are open and try to encourage contributions from external developers. 
+We are open and try to encourage contributions from external developers.
 To do so please first open an issue describing the contribution and then issue
 a pull request against the develop branch. For larger features it may be good
-to get guidance from the core development team first through the github issue. 
+to get guidance from the core development team first through the github issue.

-Note that Kokkos Core is licensed under standard 3-clause BSD terms of use. 
+Note that Kokkos Core is licensed under standard 3-clause BSD terms of use.
 Which means contributing to Kokkos allows anyone else to use your contributions
 not just for public purposes but also for closed source commercial projects.
 For specifics see the LICENSE file contained in the repository or distribution.
@ -94,9 +94,9 @@ For specifics see the LICENSE file contained in the repository or distribution.
 * Intel 18.2.199 (with gcc 4.9.3)

 ### Primary tested compilers on ARM (Cavium ThunderX2)
-* GCC 7.2.0 
+* GCC 7.2.0
 * ARM/Clang 18.4.0
-  
+
 ### Other compilers working:
 * X86:
    * Cygwin 2.1.0 64bit with gcc 4.9.3
@ -110,47 +110,47 @@ For specifics see the LICENSE file contained in the repository or distribution.


 Primary tested compiler are passing in release mode
-with warnings as errors. They also are tested with a comprehensive set of 
+with warnings as errors. They also are tested with a comprehensive set of
 backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
 We are using the following set of flags:
-* GCC:   
+* GCC:
   ````
-      -Wall -Wshadow -pedantic 
+      -Wall -Wunused-parameter -Wshadow -pedantic
      -Werror -Wsign-compare -Wtype-limits
-      -Wignored-qualifiers -Wempty-body 
+      -Wignored-qualifiers -Wempty-body
      -Wclobbered -Wuninitialized
   ````
-* Intel: 
+* Intel:
    ````
-      -Wall -Wshadow -pedantic 
-      -Werror -Wsign-compare -Wtype-limits 
+      -Wall -Wunused-parameter -Wshadow -pedantic
+      -Werror -Wsign-compare -Wtype-limits
      -Wuninitialized
    ````
-* Clang: 
+* Clang:
    ````
-      -Wall -Wshadow -pedantic 
-      -Werror -Wsign-compare -Wtype-limits 
+      -Wall -Wunused-parameter -Wshadow -pedantic
+      -Werror -Wsign-compare -Wtype-limits
      -Wuninitialized
-    ````    
+    ````

-* NVCC:  
+* NVCC:
  ````
-    -Wall -Wshadow -pedantic 
-    -Werror -Wsign-compare -Wtype-limits 
+    -Wall -Wunused-parameter -Wshadow -pedantic
+    -Werror -Wsign-compare -Wtype-limits
    -Wuninitialized
  ````

-Other compilers are tested occasionally, in particular when pushing from develop to 
+Other compilers are tested occasionally, in particular when pushing from develop to
 master branch. These are tested less rigorously without `-Werror` and only for a select set of backends.

 # Building and Installing Kokkos
-Kokkos provide a CMake build system and a raw Makefile build system. 
+Kokkos provides a CMake build system and a raw Makefile build system.
 The CMake build system is strongly encouraged and will be the most rigorously supported in future releases.
 Full details are given in the [build instructions](BUILD.md). Basic setups are shown here:

 ## CMake

-The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`: 
+The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`:
 ````
 cmake $srcdir \
  -DCMAKE_CXX_COMPILER=$path_to_compiler \
@ -162,9 +162,9 @@ cmake $srcdir \
 ````
 then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.

-To validate the Kokkos build, configure with 
+To validate the Kokkos build, configure with
 ````
- -DKokkos_ENABLE_TESTS=On 
+ -DKokkos_ENABLE_TESTS=On
 ````
 and run `make test` after completing the build.

@ -209,7 +209,7 @@ For a complete list of Kokkos options, run:
 spack info kokkos
 ````
 Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
-Generally, Spack usage should never really require you to reference the computer-generated unique install folder. 
+Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
 More details are given in the [build instructions](BUILD.md). If you must know, you can locate Spack Kokkos installations with:
 ````
 spack find -p kokkos ...
@ -217,7 +217,7 @@ spack find -p kokkos ...
 where `...` is the unique spec identifying the particular Kokkos configuration and version.


-## Raw Makefile 
+## Raw Makefile
 A bash script is provided to generate raw makefiles.
 To install Kokkos as a library create a build directory and run the following
 ````
@ -240,33 +240,33 @@ changing the device type for which to build.
 For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
 The main reason is that you may otherwise need many different
 configurations of Kokkos installed depending on the required compile time
-features an application needs. For example there is only one default 
+features an application needs. For example there is only one default
 execution space, which means you need different installations to have OpenMP
 or Pthreads as the default space. Also for the CUDA backend there are certain
-choices, such as allowing relocatable device code, which must be made at 
+choices, such as allowing relocatable device code, which must be made at
 installation time. Building Kokkos inline uses largely the same process
-as compiling an application against an installed Kokkos library. 
+as compiling an application against an installed Kokkos library.

 For CMake, this means copying over the Kokkos source code into your project and adding `add_subdirectory(kokkos)` to your CMakeLists.txt.

-For raw Makefiles, see the example benchmarks/bytes_and_flops/Makefile which can be used with an installed library and or an inline build.  
+For raw Makefiles, see the example benchmarks/bytes_and_flops/Makefile which can be used with an installed library and/or an inline build.

 # Kokkos and CUDA UVM

-Kokkos does support UVM as a specific memory space called CudaUVMSpace. 
-Allocations made with that space are accessible from host and device. 
+Kokkos does support UVM as a specific memory space called CudaUVMSpace.
+Allocations made with that space are accessible from host and device.
 You can tell Kokkos to use that as the default space for Cuda allocations.
 In either case UVM comes with a number of restrictions:
-* You can't access allocations on the host while a kernel is potentially 
-running. This will lead to segfaults. To avoid that you either need to 
+* You can't access allocations on the host while a kernel is potentially
+running. This will lead to segfaults. To avoid that you either need to
 call Kokkos::Cuda::fence() (or just Kokkos::fence()), after kernels, or
 you can set the environment variable CUDA_LAUNCH_BLOCKING=1.
-* In multi socket multi GPU machines without NVLINK, UVM defaults 
+* In multi socket multi GPU machines without NVLINK, UVM defaults
 to using zero copy allocations for technical reasons related to using multiple
 GPUs from the same process. If an executable doesn't do that (e.g. each
-MPI rank of an application uses a single GPU [can be the same GPU for 
+MPI rank of an application uses a single GPU [can be the same GPU for
 multiple MPI ranks]) you can set CUDA_MANAGED_FORCE_DEVICE_ALLOC=1.
-This will enforce proper UVM allocations, but can lead to errors if 
+This will enforce proper UVM allocations, but can lead to errors if
 more than a single GPU is used by a single process.


--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@ -537,6 +537,145 @@ struct rand<Generator, Kokkos::complex<double> > {
  }
 };

+template <class DeviceType>
+class Random_XorShift1024_Pool;
+
+namespace Impl {
+
+// State storage for Random_XorShift1024 that keeps its own copy of the 16
+// 64-bit state words in a plain C array. Because <false> is explicitly
+// specialized, this primary template is the one used for
+// UseCArrayState == true.
+template <bool UseCArrayState>
+struct Random_XorShift1024_State {
+  // Local copy of one generator's 16 state words.
+  uint64_t state_[16];
+  KOKKOS_DEFAULTED_FUNCTION
+  Random_XorShift1024_State() = default;
+
+  // Copies the words v(state_idx, 0) .. v(state_idx, 15) into the local array.
+  template <class StateViewType>
+  KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
+                                            int state_idx) {
+    for (int i = 0; i < 16; i++) state_[i] = v(state_idx, i);
+  }
+
+  // Returns word i of the local copy by value; i is not range-checked.
+  KOKKOS_FUNCTION
+  uint64_t operator[](const int i) const { return state_[i]; }
+
+  // Returns a mutable reference to word i; writes affect only the local copy.
+  KOKKOS_FUNCTION
+  uint64_t& operator[](const int i) { return state_[i]; }
+};
+
+// Specialization that does not copy the state. It stores the address of
+// v(state_idx, 0) together with v.stride_1(), so operator[] reads and writes
+// the memory of the view it was constructed from.
+template <>
+struct Random_XorShift1024_State<false> {
+  // Address of word 0 of the selected state row (nullptr when defaulted).
+  uint64_t* state_;
+  // Multiplier applied to the word index when addressing through state_.
+  const int stride_;
+  // Default construction yields a null pointer with stride 1; indexing such
+  // an object would dereference nullptr.
+  KOKKOS_FUNCTION
+  Random_XorShift1024_State() : state_(nullptr), stride_(1){};
+
+  // Binds to row state_idx of v without copying any words.
+  template <class StateViewType>
+  KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
+                                            int state_idx)
+      : state_(&v(state_idx, 0)), stride_(v.stride_1()) {}
+
+  // Returns word i, read from state_[i * stride_].
+  KOKKOS_FUNCTION
+  uint64_t operator[](const int i) const { return state_[i * stride_]; }
+
+  // Returns a mutable reference to state_[i * stride_], i.e. into the view.
+  KOKKOS_FUNCTION
+  uint64_t& operator[](const int i) { return state_[i * stride_]; }
+};
+
+// Compile-time switch for the state storage of Random_XorShift1024.
+// true_type (the default for any execution space) selects the C-array copy;
+// false_type selects the pointer-plus-stride specialization
+// Random_XorShift1024_State<false>.
+template <class ExecutionSpace>
+struct Random_XorShift1024_UseCArrayState : std::true_type {};
+
+// Cuda, HIP and OpenMPTarget opt out of the local copy whenever the
+// corresponding backend macro is defined.
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct Random_XorShift1024_UseCArrayState<Kokkos::Cuda> : std::false_type {};
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+template <>
+struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::HIP>
+    : std::false_type {};
+#endif
+#ifdef KOKKOS_ENABLE_OPENMPTARGET
+template <>
+struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
+    : std::false_type {};
+#endif
+
+// Generic mapping from the calling thread to a pool slot index.
+// When KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is defined the slot is the
+// execution space's hardware thread id; otherwise slot 0 is returned.
+// The locks view is neither read nor written here; the parameter exists so
+// that every Random_UniqueIndex variant can be called the same way.
+template <class ExecutionSpace>
+struct Random_UniqueIndex {
+  using locks_view_type = View<int*, ExecutionSpace>;
+  // The view is taken by const reference, matching the Cuda and HIP
+  // specializations, so the unused argument is not copied on every call.
+  KOKKOS_FUNCTION
+  static int get_state_idx(const locks_view_type&) {
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
+    // Older spelling, kept for builds with deprecated code enabled.
+    const int i = ExecutionSpace::hardware_thread_id();
+#else
+    const int i = ExecutionSpace::impl_hardware_thread_id();
+#endif
+    return i;
+#else
+    return 0;
+#endif
+  }
+};
+
+#ifdef KOKKOS_ENABLE_CUDA
+// Cuda specialization: chooses a slot of locks_ for the calling thread and
+// claims it with an atomic compare-exchange of 0 -> 1 before returning it.
+template <>
+struct Random_UniqueIndex<Kokkos::Cuda> {
+  using locks_view_type = View<int*, Kokkos::Cuda>;
+  // Returns the index of the claimed slot. When __CUDA_ARCH__ is not defined
+  // the view is left untouched and 0 is returned.
+  KOKKOS_FUNCTION
+  static int get_state_idx(const locks_view_type& locks_) {
+#ifdef __CUDA_ARCH__
+    // Linearized index of this thread within its block.
+    const int i_offset =
+        (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
+    // Starting slot: linearized block index times block size plus i_offset,
+    // reduced modulo the number of locks.
+    int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
+                 blockDim.x * blockDim.y * blockDim.z +
+             i_offset) %
+            locks_.extent(0);
+    // Keep probing while the exchange reports a non-zero value (presumably
+    // the previous content, meaning the slot is already held).
+    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+      // Step forward by one block's worth of threads.
+      i += blockDim.x * blockDim.y * blockDim.z;
+      if (i >= static_cast<int>(locks_.extent(0))) {
+        // NOTE(review): wraps to i_offset without reducing it modulo
+        // extent(0); assumes the view has more entries than i_offset — confirm.
+        i = i_offset;
+      }
+    }
+    return i;
+#else
+    (void)locks_;
+    return 0;
+#endif
+  }
+};
+#endif
+
+#ifdef KOKKOS_ENABLE_HIP
+// HIP specialization: chooses a slot of locks_ for the calling thread and
+// claims it with an atomic compare-exchange of 0 -> 1 before returning it.
+template <>
+struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
+  using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
+  // Returns the index of the claimed slot. When __HIP_DEVICE_COMPILE__ is not
+  // defined the view is left untouched and 0 is returned.
+  KOKKOS_FUNCTION
+  static int get_state_idx(const locks_view_type& locks_) {
+#ifdef __HIP_DEVICE_COMPILE__
+    // Linearized index of this thread within its block.
+    const int i_offset =
+        (hipThreadIdx_x * hipBlockDim_y + hipThreadIdx_y) * hipBlockDim_z +
+        hipThreadIdx_z;
+    // Starting slot: linearized block index times block size plus i_offset,
+    // reduced modulo the number of locks.
+    int i = (((hipBlockIdx_x * hipGridDim_y + hipBlockIdx_y) * hipGridDim_z +
+              hipBlockIdx_z) *
+                 hipBlockDim_x * hipBlockDim_y * hipBlockDim_z +
+             i_offset) %
+            locks_.extent(0);
+    // Keep probing while the exchange reports a non-zero value (presumably
+    // the previous content, meaning the slot is already held).
+    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
+      // Step forward by one block's worth of threads.
+      i += hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
+      if (i >= static_cast<int>(locks_.extent(0))) {
+        // NOTE(review): wraps to i_offset without reducing it modulo
+        // extent(0); assumes the view has more entries than i_offset — confirm.
+        i = i_offset;
+      }
+    }
+    return i;
+#else
+    (void)locks_;
+    return 0;
+#endif
+  }
+};
+#endif
+
+}  // namespace Impl
+
 template <class DeviceType>
 class Random_XorShift64_Pool;

@ -550,10 +689,10 @@ class Random_XorShift64 {
 public:
  typedef DeviceType device_type;

-  enum { MAX_URAND = 0xffffffffU };
-  enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
-  enum { MAX_RAND = static_cast<int>(0xffffffff / 2) };
-  enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL / 2 - 1) };
+  constexpr static uint32_t MAX_URAND   = std::numeric_limits<uint32_t>::max();
+  constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
+  constexpr static int32_t MAX_RAND     = std::numeric_limits<int32_t>::max();
+  constexpr static int64_t MAX_RAND64   = std::numeric_limits<int64_t>::max();

  KOKKOS_INLINE_FUNCTION
  Random_XorShift64(uint64_t state, int state_idx = 0)
@ -637,10 +776,12 @@ class Random_XorShift64 {
  }

  KOKKOS_INLINE_FUNCTION
-  float frand() { return 1.0f * urand64() / MAX_URAND64; }
+  float frand() { return urand64() / static_cast<float>(MAX_URAND64); }

  KOKKOS_INLINE_FUNCTION
-  float frand(const float& range) { return range * urand64() / MAX_URAND64; }
+  float frand(const float& range) {
+    return range * urand64() / static_cast<float>(MAX_URAND64);
+  }

  KOKKOS_INLINE_FUNCTION
  float frand(const float& start, const float& end) {
@ -648,10 +789,12 @@ class Random_XorShift64 {
  }

  KOKKOS_INLINE_FUNCTION
-  double drand() { return 1.0 * urand64() / MAX_URAND64; }
+  double drand() { return urand64() / static_cast<double>(MAX_URAND64); }

  KOKKOS_INLINE_FUNCTION
-  double drand(const double& range) { return range * urand64() / MAX_URAND64; }
+  double drand(const double& range) {
+    return range * urand64() / static_cast<double>(MAX_URAND64);
+  }

  KOKKOS_INLINE_FUNCTION
  double drand(const double& start, const double& end) {
@ -662,6 +805,11 @@ class Random_XorShift64 {
  // number
  KOKKOS_INLINE_FUNCTION
  double normal() {
+#ifndef __HIP_DEVICE_COMPILE__  // FIXME_HIP
+    using std::sqrt;
+#else
+    using ::sqrt;
+#endif
    double S = 2.0;
    double U;
    while (S >= 1.0) {
@ -669,7 +817,7 @@ class Random_XorShift64 {
      const double V = 2.0 * drand() - 1.0;
      S              = U * U + V * V;
    }
-    return U * std::sqrt(-2.0 * log(S) / S);
+    return U * sqrt(-2.0 * log(S) / S);
  }

  KOKKOS_INLINE_FUNCTION
@ -681,9 +829,10 @@ class Random_XorShift64 {
 template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift64_Pool {
 private:
-  typedef View<int*, DeviceType> lock_type;
+  using execution_space = typename DeviceType::execution_space;
+  typedef View<int*, execution_space> locks_type;
  typedef View<uint64_t*, DeviceType> state_data_type;
-  lock_type locks_;
+  locks_type locks_;
  state_data_type state_;
  int num_states_;

@ -695,11 +844,8 @@ class Random_XorShift64_Pool {
  Random_XorShift64_Pool() { num_states_ = 0; }
  Random_XorShift64_Pool(uint64_t seed) {
    num_states_ = 0;
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    init(seed, DeviceType::max_hardware_threads());
-#else
-    init(seed, DeviceType::impl_max_hardware_threads());
-#endif
+
+    init(seed, execution_space().concurrency());
  }

  KOKKOS_INLINE_FUNCTION
@ -719,11 +865,11 @@ class Random_XorShift64_Pool {

    num_states_ = num_states;

-    locks_ = lock_type("Kokkos::Random_XorShift64::locks", num_states_);
+    locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);

    typename state_data_type::HostMirror h_state = create_mirror_view(state_);
-    typename lock_type::HostMirror h_lock        = create_mirror_view(locks_);
+    typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);

    // Execute on the HostMirror's default execution space.
    Random_XorShift64<typename state_data_type::HostMirror::execution_space>
@ -746,13 +892,8 @@ class Random_XorShift64_Pool {

  KOKKOS_INLINE_FUNCTION
  Random_XorShift64<DeviceType> get_state() const {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    const int i = DeviceType::hardware_thread_id();
-    ;
-#else
-    const int i = DeviceType::impl_hardware_thread_id();
-    ;
-#endif
+    const int i =
+        Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
    return Random_XorShift64<DeviceType>(state_(i), i);
  }

@ -765,35 +906,35 @@ class Random_XorShift64_Pool {
  KOKKOS_INLINE_FUNCTION
  void free_state(const Random_XorShift64<DeviceType>& state) const {
    state_(state.state_idx_) = state.state_;
+    locks_(state.state_idx_) = 0;
  }
 };

-template <class DeviceType>
-class Random_XorShift1024_Pool;
-
 template <class DeviceType>
 class Random_XorShift1024 {
+  using execution_space = typename DeviceType::execution_space;
+
 private:
  int p_;
  const int state_idx_;
-  uint64_t state_[16];
+  Impl::Random_XorShift1024_State<
+      Impl::Random_XorShift1024_UseCArrayState<execution_space>::value>
+      state_;
  friend class Random_XorShift1024_Pool<DeviceType>;

 public:
  typedef Random_XorShift1024_Pool<DeviceType> pool_type;
  typedef DeviceType device_type;

-  enum { MAX_URAND = 0xffffffffU };
-  enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
-  enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
-  enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
+  constexpr static uint32_t MAX_URAND   = std::numeric_limits<uint32_t>::max();
+  constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
+  constexpr static int32_t MAX_RAND     = std::numeric_limits<int32_t>::max();
+  constexpr static int64_t MAX_RAND64   = std::numeric_limits<int64_t>::max();

  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
                      int state_idx = 0)
-      : p_(p), state_idx_(state_idx) {
-    for (int i = 0; i < 16; i++) state_[i] = state(state_idx, i);
-  }
+      : p_(p), state_idx_(state_idx), state_(state, state_idx) {}

  KOKKOS_INLINE_FUNCTION
  uint32_t urand() {
@ -876,10 +1017,12 @@ class Random_XorShift1024 {
  }

  KOKKOS_INLINE_FUNCTION
-  float frand() { return 1.0f * urand64() / MAX_URAND64; }
+  float frand() { return urand64() / static_cast<float>(MAX_URAND64); }

  KOKKOS_INLINE_FUNCTION
-  float frand(const float& range) { return range * urand64() / MAX_URAND64; }
+  float frand(const float& range) {
+    return range * urand64() / static_cast<float>(MAX_URAND64);
+  }

  KOKKOS_INLINE_FUNCTION
  float frand(const float& start, const float& end) {
@ -887,10 +1030,12 @@ class Random_XorShift1024 {
  }

  KOKKOS_INLINE_FUNCTION
-  double drand() { return 1.0 * urand64() / MAX_URAND64; }
+  double drand() { return urand64() / static_cast<double>(MAX_URAND64); }

  KOKKOS_INLINE_FUNCTION
-  double drand(const double& range) { return range * urand64() / MAX_URAND64; }
+  double drand(const double& range) {
+    return range * urand64() / static_cast<double>(MAX_URAND64);
+  }

  KOKKOS_INLINE_FUNCTION
  double drand(const double& start, const double& end) {
@ -901,6 +1046,11 @@ class Random_XorShift1024 {
  // number
  KOKKOS_INLINE_FUNCTION
  double normal() {
+#ifndef KOKKOS_ENABLE_HIP  // FIXME_HIP
+    using std::sqrt;
+#else
+    using ::sqrt;
+#endif
    double S = 2.0;
    double U;
    while (S >= 1.0) {
@ -908,7 +1058,7 @@ class Random_XorShift1024 {
      const double V = 2.0 * drand() - 1.0;
      S              = U * U + V * V;
    }
-    return U * std::sqrt(-2.0 * log(S) / S);
+    return U * sqrt(-2.0 * log(S) / S);
  }

  KOKKOS_INLINE_FUNCTION
@ -920,10 +1070,12 @@ class Random_XorShift1024 {
 template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift1024_Pool {
 private:
+  using execution_space = typename DeviceType::execution_space;
+  typedef View<int*, execution_space> locks_type;
  typedef View<int*, DeviceType> int_view_type;
  typedef View<uint64_t * [16], DeviceType> state_data_type;

-  int_view_type locks_;
+  locks_type locks_;
  state_data_type state_;
  int_view_type p_;
  int num_states_;
@ -939,11 +1091,8 @@ class Random_XorShift1024_Pool {

  inline Random_XorShift1024_Pool(uint64_t seed) {
    num_states_ = 0;
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    init(seed, DeviceType::max_hardware_threads());
-#else
-    init(seed, DeviceType::impl_max_hardware_threads());
-#endif
+
+    init(seed, execution_space().concurrency());
  }

  KOKKOS_INLINE_FUNCTION
@ -965,12 +1114,12 @@ class Random_XorShift1024_Pool {
  inline void init(uint64_t seed, int num_states) {
    if (seed == 0) seed = uint64_t(1318319);
    num_states_ = num_states;
-    locks_ = int_view_type("Kokkos::Random_XorShift1024::locks", num_states_);
+    locks_      = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
    state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
    p_     = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);

    typename state_data_type::HostMirror h_state = create_mirror_view(state_);
-    typename int_view_type::HostMirror h_lock    = create_mirror_view(locks_);
+    typename locks_type::HostMirror h_lock       = create_mirror_view(locks_);
    typename int_view_type::HostMirror h_p       = create_mirror_view(p_);

    // Execute on the HostMirror's default execution space.
@ -997,11 +1146,8 @@ class Random_XorShift1024_Pool {

  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024<DeviceType> get_state() const {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    const int i = DeviceType::hardware_thread_id();
-#else
-    const int i = DeviceType::impl_hardware_thread_id();
-#endif
+    const int i =
+        Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
    return Random_XorShift1024<DeviceType>(state_, p_(i), i);
  };

@ -1014,482 +1160,11 @@ class Random_XorShift1024_Pool {
  KOKKOS_INLINE_FUNCTION
  void free_state(const Random_XorShift1024<DeviceType>& state) const {
    for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-    p_(state.state_idx_) = state.p_;
+    p_(state.state_idx_)     = state.p_;
+    locks_(state.state_idx_) = 0;
  }
 };

-#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
-
-template <>
-class Random_XorShift1024<Kokkos::Cuda> {
- private:
-  int p_;
-  const int state_idx_;
-  uint64_t* state_;
-  const int stride_;
-  friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
-
- public:
-  typedef Kokkos::Cuda device_type;
-  typedef Random_XorShift1024_Pool<device_type> pool_type;
-
-  enum { MAX_URAND = 0xffffffffU };
-  enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
-  enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
-  enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
-
-  KOKKOS_INLINE_FUNCTION
-  Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
-                      int state_idx = 0)
-      : p_(p),
-        state_idx_(state_idx),
-        state_(&state(state_idx, 0)),
-        stride_(state.stride_1()) {}
-
-  KOKKOS_INLINE_FUNCTION
-  uint32_t urand() {
-    uint64_t state_0 = state_[p_ * stride_];
-    uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
-    state_1 ^= state_1 << 31;
-    state_1 ^= state_1 >> 11;
-    state_0 ^= state_0 >> 30;
-    uint64_t tmp =
-        (state_[p_ * stride_] = state_0 ^ state_1) * 1181783497276652981ULL;
-    tmp = tmp >> 16;
-    return static_cast<uint32_t>(tmp & MAX_URAND);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint64_t urand64() {
-    uint64_t state_0 = state_[p_ * stride_];
-    uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
-    state_1 ^= state_1 << 31;
-    state_1 ^= state_1 >> 11;
-    state_0 ^= state_0 >> 30;
-    return ((state_[p_ * stride_] = state_0 ^ state_1) *
-            1181783497276652981LL) -
-           1;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint32_t urand(const uint32_t& range) {
-    const uint32_t max_val = (MAX_URAND / range) * range;
-    uint32_t tmp           = urand();
-    while (tmp >= max_val) urand();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint32_t urand(const uint32_t& start, const uint32_t& end) {
-    return urand(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint64_t urand64(const uint64_t& range) {
-    const uint64_t max_val = (MAX_URAND64 / range) * range;
-    uint64_t tmp           = urand64();
-    while (tmp >= max_val) urand64();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint64_t urand64(const uint64_t& start, const uint64_t& end) {
-    return urand64(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int rand() { return static_cast<int>(urand() / 2); }
-
-  KOKKOS_INLINE_FUNCTION
-  int rand(const int& range) {
-    const int max_val = (MAX_RAND / range) * range;
-    int tmp           = rand();
-    while (tmp >= max_val) rand();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int rand(const int& start, const int& end) {
-    return rand(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
-
-  KOKKOS_INLINE_FUNCTION
-  int64_t rand64(const int64_t& range) {
-    const int64_t max_val = (MAX_RAND64 / range) * range;
-    int64_t tmp           = rand64();
-    while (tmp >= max_val) rand64();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int64_t rand64(const int64_t& start, const int64_t& end) {
-    return rand64(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  float frand() { return 1.0f * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  float frand(const float& range) { return range * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  float frand(const float& start, const float& end) {
-    return frand(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  double drand() { return 1.0 * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  double drand(const double& range) { return range * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  double drand(const double& start, const double& end) {
-    return frand(end - start) + start;
-  }
-
-  // Marsaglia polar method for drawing a standard normal distributed random
-  // number
-  KOKKOS_INLINE_FUNCTION
-  double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U              = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S              = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * log(S) / S);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  double normal(const double& mean, const double& std_dev = 1.0) {
-    return mean + normal() * std_dev;
-  }
-};
-
-template <>
-inline Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(
-    uint64_t seed) {
-  num_states_ = 0;
-  init(seed, 4 * 32768);
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION Random_XorShift64<Kokkos::Cuda>
-Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const {
-#ifdef __CUDA_ARCH__
-  const int i_offset =
-      (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
-  int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
-               blockDim.x * blockDim.y * blockDim.z +
-           i_offset) %
-          num_states_;
-  while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
-    i += blockDim.x * blockDim.y * blockDim.z;
-    if (i >= num_states_) {
-      i = i_offset;
-    }
-  }
-
-  return Random_XorShift64<Kokkos::Cuda>(state_(i), i);
-#else
-  return Random_XorShift64<Kokkos::Cuda>(state_(0), 0);
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(
-    const Random_XorShift64<Kokkos::Cuda>& state) const {
-  state_(state.state_idx_) = state.state_;
-#ifdef __CUDA_ARCH__
-  locks_(state.state_idx_) = 0;
-  return;
-#endif
-}
-
-template <>
-inline Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(
-    uint64_t seed) {
-  num_states_ = 0;
-  init(seed, 4 * 32768);
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION Random_XorShift1024<Kokkos::Cuda>
-Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const {
-#ifdef __CUDA_ARCH__
-  const int i_offset =
-      (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
-  int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
-               blockDim.x * blockDim.y * blockDim.z +
-           i_offset) %
-          num_states_;
-  while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
-    i += blockDim.x * blockDim.y * blockDim.z;
-    if (i >= num_states_) {
-      i = i_offset;
-    }
-  }
-
-  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
-#else
-  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(
-    const Random_XorShift1024<Kokkos::Cuda>& state) const {
-  for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-#ifdef __CUDA_ARCH__
-  locks_(state.state_idx_) = 0;
-  return;
-#endif
-}
-
-#endif
-
-#if defined(KOKKOS_ENABLE_ROCM)
-
-template <>
-class Random_XorShift1024<Kokkos::Experimental::ROCm> {
- private:
-  int p_;
-  const int state_idx_;
-  uint64_t* state_;
-  const int stride_;
-  friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
-
- public:
-  typedef Kokkos::Experimental::ROCm device_type;
-  typedef Random_XorShift1024_Pool<device_type> pool_type;
-
-  enum { MAX_URAND = 0xffffffffU };
-  enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
-  enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
-  enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
-
-  KOKKOS_INLINE_FUNCTION
-  Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
-                      int state_idx = 0)
-      : p_(p),
-        state_idx_(state_idx),
-        state_(&state(state_idx, 0)),
-        stride_(state.stride_1()) {}
-
-  KOKKOS_INLINE_FUNCTION
-  uint32_t urand() {
-    uint64_t state_0 = state_[p_ * stride_];
-    uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
-    state_1 ^= state_1 << 31;
-    state_1 ^= state_1 >> 11;
-    state_0 ^= state_0 >> 30;
-    uint64_t tmp =
-        (state_[p_ * stride_] = state_0 ^ state_1) * 1181783497276652981ULL;
-    tmp = tmp >> 16;
-    return static_cast<uint32_t>(tmp & MAX_URAND);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint64_t urand64() {
-    uint64_t state_0 = state_[p_ * stride_];
-    uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
-    state_1 ^= state_1 << 31;
-    state_1 ^= state_1 >> 11;
-    state_0 ^= state_0 >> 30;
-    return ((state_[p_ * stride_] = state_0 ^ state_1) *
-            1181783497276652981LL) -
-           1;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint32_t urand(const uint32_t& range) {
-    const uint32_t max_val = (MAX_URAND / range) * range;
-    uint32_t tmp           = urand();
-    while (tmp >= max_val) urand();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint32_t urand(const uint32_t& start, const uint32_t& end) {
-    return urand(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint64_t urand64(const uint64_t& range) {
-    const uint64_t max_val = (MAX_URAND64 / range) * range;
-    uint64_t tmp           = urand64();
-    while (tmp >= max_val) urand64();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  uint64_t urand64(const uint64_t& start, const uint64_t& end) {
-    return urand64(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int rand() { return static_cast<int>(urand() / 2); }
-
-  KOKKOS_INLINE_FUNCTION
-  int rand(const int& range) {
-    const int max_val = (MAX_RAND / range) * range;
-    int tmp           = rand();
-    while (tmp >= max_val) rand();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int rand(const int& start, const int& end) {
-    return rand(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
-
-  KOKKOS_INLINE_FUNCTION
-  int64_t rand64(const int64_t& range) {
-    const int64_t max_val = (MAX_RAND64 / range) * range;
-    int64_t tmp           = rand64();
-    while (tmp >= max_val) rand64();
-    return tmp % range;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  int64_t rand64(const int64_t& start, const int64_t& end) {
-    return rand64(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  float frand() { return 1.0f * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  float frand(const float& range) { return range * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  float frand(const float& start, const float& end) {
-    return frand(end - start) + start;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  double drand() { return 1.0 * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  double drand(const double& range) { return range * urand64() / MAX_URAND64; }
-
-  KOKKOS_INLINE_FUNCTION
-  double drand(const double& start, const double& end) {
-    return frand(end - start) + start;
-  }
-
-  // Marsaglia polar method for drawing a standard normal distributed random
-  // number
-  KOKKOS_INLINE_FUNCTION
-  double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U              = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S              = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * log(S) / S);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  double normal(const double& mean, const double& std_dev = 1.0) {
-    return mean + normal() * std_dev;
-  }
-};
-
-template <>
-inline Random_XorShift64_Pool<
-    Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
-  num_states_ = 0;
-  init(seed, 4 * 32768);
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION Random_XorShift64<Kokkos::Experimental::ROCm>
-Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
-#ifdef __HCC_ACCELERATOR__
-  const int i_offset =
-      (threadIdx_x * blockDim_y + threadIdx_y) * blockDim_z + threadIdx_z;
-  int i = (((blockIdx_x * gridDim_y + blockIdx_y) * gridDim_z + blockIdx_z) *
-               blockDim_x * blockDim_y * blockDim_z +
-           i_offset) %
-          num_states_;
-  while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
-    i += blockDim_x * blockDim_y * blockDim_z;
-    if (i >= num_states_) {
-      i = i_offset;
-    }
-  }
-
-  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i), i);
-#else
-  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0), 0);
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void
-Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(
-    const Random_XorShift64<Kokkos::Experimental::ROCm>& state) const {
-#ifdef __HCC_ACCELERATOR__
-  state_(state.state_idx_) = state.state_;
-  locks_(state.state_idx_) = 0;
-  return;
-#endif
-}
-
-template <>
-inline Random_XorShift1024_Pool<
-    Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
-  num_states_ = 0;
-  init(seed, 4 * 32768);
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION Random_XorShift1024<Kokkos::Experimental::ROCm>
-Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
-#ifdef __HCC_ACCELERATOR__
-  const int i_offset =
-      (threadIdx_x * blockDim_y + threadIdx_y) * blockDim_z + threadIdx_z;
-  int i = (((blockIdx_x * gridDim_y + blockIdx_y) * gridDim_z + blockIdx_z) *
-               blockDim_x * blockDim_y * blockDim_z +
-           i_offset) %
-          num_states_;
-  while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
-    i += blockDim_x * blockDim_y * blockDim_z;
-    if (i >= num_states_) {
-      i = i_offset;
-    }
-  }
-
-  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
-#else
-  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
-#endif
-}
-
-template <>
-KOKKOS_INLINE_FUNCTION void
-Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(
-    const Random_XorShift1024<Kokkos::Experimental::ROCm>& state) const {
-#ifdef __HCC_ACCELERATOR__
-  for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
-  locks_(state.state_idx_) = 0;
-  return;
-#endif
-}
-
-#endif
-
 namespace Impl {

 template <class ViewType, class RandomPool, int loops, int rank,
@ -2043,7 +1718,7 @@ void fill_random(ViewType a, RandomPool g,
                 typename ViewType::const_value_type range) {
  int64_t LDA = a.extent(0);
  if (LDA > 0)
-    parallel_for((LDA + 127) / 128,
+    parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
                 Impl::fill_random_functor_range<ViewType, RandomPool, 128,
                                                 ViewType::Rank, IndexType>(
                     a, g, range));
@ -2055,7 +1730,7 @@ void fill_random(ViewType a, RandomPool g,
                 typename ViewType::const_value_type end) {
  int64_t LDA = a.extent(0);
  if (LDA > 0)
-    parallel_for((LDA + 127) / 128,
+    parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
                 Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
                                                     ViewType::Rank, IndexType>(
                     a, g, begin, end));
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@ -201,7 +201,7 @@ class BinSort {
  bool sort_within_bins;

 public:
-  BinSort() {}
+  BinSort() = default;

  //----------------------------------------
  // Constructor: takes the keys, the binning_operator and optionally whether to
@ -327,7 +327,7 @@ class BinSort {
                   Kokkos::RangePolicy<execution_space>(0, len), functor);
    }

-    Kokkos::fence();
+    execution_space().fence();
  }

  template <class ValuesViewType>
@ -349,14 +349,14 @@ class BinSort {

 public:
  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_count_tag& tag, const int& i) const {
+  void operator()(const bin_count_tag& /*tag*/, const int i) const {
    const int j = range_begin + i;
    bin_count_atomic(bin_op.bin(keys, j))++;
  }

  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_offset_tag& tag, const int& i, value_type& offset,
-                  const bool& final) const {
+  void operator()(const bin_offset_tag& /*tag*/, const int i,
+                  value_type& offset, const bool& final) const {
    if (final) {
      bin_offsets(i) = offset;
    }
@ -364,7 +364,7 @@ class BinSort {
  }

  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_binning_tag& tag, const int& i) const {
+  void operator()(const bin_binning_tag& /*tag*/, const int i) const {
    const int j     = range_begin + i;
    const int bin   = bin_op.bin(keys, j);
    const int count = bin_count_atomic(bin)++;
@ -373,7 +373,7 @@ class BinSort {
  }

  KOKKOS_INLINE_FUNCTION
-  void operator()(const bin_sort_bins_tag& tag, const int& i) const {
+  void operator()(const bin_sort_bins_tag& /*tag*/, const int i) const {
    auto bin_size = bin_count_const(i);
    if (bin_size <= 1) return;
    int upper_bound = bin_offsets(i) + bin_size;
@ -381,7 +381,7 @@ class BinSort {
    while (!sorted) {
      sorted      = true;
      int old_idx = sort_order(bin_offsets(i));
-      int new_idx;
+      int new_idx = 0;
      for (int k = bin_offsets(i) + 1; k < upper_bound; k++) {
        new_idx = sort_order(k);

@ -446,7 +446,7 @@ struct BinOp3D {
  typename KeyViewType::non_const_value_type range_[3];
  typename KeyViewType::non_const_value_type min_[3];

-  BinOp3D() {}
+  BinOp3D() = default;

  BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
          typename KeyViewType::const_value_type max[]) {
--- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
@ -20,16 +20,38 @@ KOKKOS_ADD_TEST_LIBRARY(
  HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h
  SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc
 )
-KOKKOS_TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0")
+# WORKAROUND FOR HIPCC: pass --amdgpu-target as a compile option, not as part
+# of the GTEST_HAS_PTHREAD definition string.
+TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_PTHREAD=0)
+IF(Kokkos_ENABLE_HIP)
+  TARGET_COMPILE_OPTIONS(kokkosalgorithms_gtest PUBLIC --amdgpu-target=gfx906)
+ENDIF()
+
+TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11)

 SET(SOURCES
-  UnitTestMain.cpp 
-  TestCuda.cpp
-  )
+  UnitTestMain.cpp
+)

 IF(Kokkos_ENABLE_OPENMP)
  LIST( APPEND SOURCES
    TestOpenMP.cpp
+    TestOpenMP_Sort1D.cpp
+    TestOpenMP_Sort3D.cpp
+    TestOpenMP_SortDynamicView.cpp
+    TestOpenMP_Random.cpp
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_HIP)
+  LIST( APPEND SOURCES
+    TestHIP.cpp
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_CUDA)
+  LIST( APPEND SOURCES
+    TestCuda.cpp
  )
 ENDIF()

--- a/lib/kokkos/algorithms/unit_tests/Makefile
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@ -44,7 +44,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	OBJ_OPENMP = TestOpenMP.o TestOpenMP_Random.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_OpenMP
 	TEST_TARGETS += test-openmp
 endif
--- a/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
@ -59,11 +59,15 @@
 namespace Test {

 void cuda_test_random_xorshift64(int num_draws) {
-  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Cuda> >(num_draws);
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Cuda>>(num_draws);
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<
+      Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>>>(num_draws);
 }

 void cuda_test_random_xorshift1024(int num_draws) {
-  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda> >(num_draws);
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda>>(num_draws);
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<
+      Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>>>(num_draws);
 }

 #define CUDA_RANDOM_XORSHIFT64(num_draws) \
--- a/lib/kokkos/algorithms/unit_tests/TestHIP.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestHIP.cpp
@ -0,0 +1,83 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_HIP
+
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+void hip_test_random_xorshift64(size_t num_draws) {
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::HIP>>(
+      num_draws);
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Device<
+      Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace>>>(num_draws);
+}
+
+void hip_test_random_xorshift1024(size_t num_draws) {
+  Impl::test_random<
+      Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::HIP>>(num_draws);
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Device<
+      Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace>>>(num_draws);
+}
+
+TEST(hip, Random_XorShift64) { hip_test_random_xorshift64(132141141); }
+TEST(hip, Random_XorShift1024_0) { hip_test_random_xorshift1024(52428813); }
+TEST(hip, SortUnsigned) {
+  Impl::test_sort<Kokkos::Experimental::HIP, unsigned>(171);
+}
+}  // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTHIP_PREVENT_LINK_ERROR() {}
+#endif /* #ifdef KOKKOS_ENABLE_HIP */
--- a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp
@ -55,30 +55,8 @@

 namespace Test {

-#define OPENMP_RANDOM_XORSHIFT64(num_draws)                             \
-  TEST(openmp, Random_XorShift64) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >( \
-        num_draws);                                                     \
-  }
+TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort<Kokkos::OpenMP>(); }

-#define OPENMP_RANDOM_XORSHIFT1024(num_draws)                             \
-  TEST(openmp, Random_XorShift1024) {                                     \
-    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >( \
-        num_draws);                                                       \
-  }
-
-#define OPENMP_SORT_UNSIGNED(size)                   \
-  TEST(openmp, SortUnsigned) {                       \
-    Impl::test_sort<Kokkos::OpenMP, unsigned>(size); \
-  }
-
-OPENMP_RANDOM_XORSHIFT64(10240000)
-OPENMP_RANDOM_XORSHIFT1024(10130144)
-OPENMP_SORT_UNSIGNED(171)
-
-#undef OPENMP_RANDOM_XORSHIFT64
-#undef OPENMP_RANDOM_XORSHIFT1024
-#undef OPENMP_SORT_UNSIGNED
 }  // namespace Test
 #else
 void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {}
--- a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Random.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Random.cpp
@ -0,0 +1,77 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <iomanip>
+
+namespace Test {
+
+#define OPENMP_RANDOM_XORSHIFT64(num_draws)                             \
+  TEST(openmp, Random_XorShift64) {                                     \
+    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >( \
+        num_draws);                                                     \
+  }
+
+#define OPENMP_RANDOM_XORSHIFT1024(num_draws)                             \
+  TEST(openmp, Random_XorShift1024) {                                     \
+    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >( \
+        num_draws);                                                       \
+  }
+
+OPENMP_RANDOM_XORSHIFT64(10240000)
+OPENMP_RANDOM_XORSHIFT1024(10130144)
+
+#undef OPENMP_RANDOM_XORSHIFT64
+#undef OPENMP_RANDOM_XORSHIFT1024
+}  // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_RANDOM_PREVENT_LINK_ERROR() {}
+#endif
--- a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp
@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+TEST(openmp, SortUnsigned1D) {
+  Impl::test_1D_sort<Kokkos::OpenMP, unsigned>(171);
+}
+
+}  // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_SORT1D_PREVENT_LINK_ERROR() {}
+#endif
--- a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort3D.cpp
@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+TEST(openmp, SortUnsigned3D) {
+  Impl::test_3D_sort<Kokkos::OpenMP, unsigned>(171);
+}
+
+}  // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_SORT3D_PREVENT_LINK_ERROR() {}
+#endif
--- a/lib/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP_SortDynamicView.cpp
@ -0,0 +1,65 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_OPENMP
+
+#include <gtest/gtest.h>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+TEST(openmp, SortUnsignedDynamicView) {
+  Impl::test_dynamic_view_sort<Kokkos::OpenMP, unsigned>(171);
+}
+
+}  // namespace Test
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_SORTDYNAMICVIEW_PREVENT_LINK_ERROR() {}
+#endif
--- a/Show More
+++ b/Show More