Merge pull request #2311 from stanmoore1/kk_update_3.2

Update Kokkos library in LAMMPS to v3.2
2020-08-26 15:35:25 -04:00 · 2020-08-26 15:35:25 -04:00 · d00807ee9a
parent aeb3e20385 50b8b1bf60
commit d00807ee9a
1437 changed files with 19592 additions and 72168 deletions
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -35,8 +35,8 @@ if(DOWNLOAD_KOKKOS)
  list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
  include(ExternalProject)
  ExternalProject_Add(kokkos_build
-    URL https://github.com/kokkos/kokkos/archive/3.1.01.tar.gz
-    URL_MD5 3ccb2100f7fc316891e7dad3bc33fa37
+    URL https://github.com/kokkos/kokkos/archive/3.2.00.tar.gz
+    URL_MD5 81569170fe232e5e64ab074f7cca5e50
    CMAKE_ARGS ${KOKKOS_LIB_BUILD_ARGS}
    BUILD_BYPRODUCTS <INSTALL_DIR>/lib/libkokkoscore.a
  )
@ -50,7 +50,7 @@ if(DOWNLOAD_KOKKOS)
  target_link_libraries(lammps PRIVATE LAMMPS::KOKKOS)
  add_dependencies(LAMMPS::KOKKOS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-  find_package(Kokkos 3.1.01 REQUIRED CONFIG)
+  find_package(Kokkos 3.2.00 REQUIRED CONFIG)
  target_link_libraries(lammps PRIVATE Kokkos::kokkos)
 else()
  set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@ -361,9 +361,12 @@ be specified in uppercase.
   *  - AMDAVX
      - HOST
      - AMD 64-bit x86 CPU (AVX 1)
-   *  - EPYC
+   *  - ZEN
      - HOST
-      - AMD EPYC Zen class CPU (AVX 2)
+      - AMD Zen class CPU (AVX 2)
+   *  - ZEN2
+      - HOST
+      - AMD Zen2 class CPU (AVX 2)
   *  - ARMV80
      - HOST
      - ARMv8.0 Compatible CPU
@ -445,12 +448,18 @@ be specified in uppercase.
   *  - TURING75
      - GPU
      - NVIDIA Turing generation CC 7.5 GPU
+   *  - AMPERE80
+      - GPU
+      - NVIDIA Ampere generation CC 8.0 GPU
   *  - VEGA900
      - GPU
      - AMD GPU MI25 GFX900
   *  - VEGA906
      - GPU
      - AMD GPU MI50/MI60 GFX906
+   *  - INTEL_GEN
+      - GPU
+      - Intel GPUs Gen9+

 Basic CMake build settings:
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
--- a/lib/kokkos/BUILD.md
+++ b/lib/kokkos/BUILD.md
@ -10,33 +10,45 @@ for C++.  Applications heavily leveraging Kokkos are strongly encouraged to use
 You can either use Kokkos as an installed package (encouraged) or use Kokkos in-tree in your project.
 Modern CMake is exceedingly simple at a high-level (with the devil in the details).
 Once Kokkos is installed, in your `CMakeLists.txt` simply use:
-````
+````cmake
 find_package(Kokkos REQUIRED)
 ````
 Then for every executable or library in your project:
-````
+````cmake
 target_link_libraries(myTarget Kokkos::kokkos)
 ````
 That's it! There is no checking Kokkos preprocessor, compiler, or linker flags.
 Kokkos propagates all the necessary flags to your project.
 This means not only is linking to Kokkos easy, but Kokkos itself can actually configure compiler and linker flags for *your*
-project. If building in-tree, there is no `find_package` and you link with `target_link_libraries(kokkos)`.
+project.
+When configuring your project just set:
+````bash
+> cmake ${srcdir} \
+  -DKokkos_ROOT=${kokkos_install_prefix} \
+  -DCMAKE_CXX_COMPILER=${compiler_used_to_build_kokkos}
+````
+Note: You may need the following if using some versions of CMake (e.g. 3.12):
+````cmake
+cmake_policy(SET CMP0074 NEW)
+````
+If building in-tree, there is no `find_package`. You can use `add_subdirectory(kokkos)` with the Kokkos source and again just link with `target_link_libraries(myTarget Kokkos::kokkos)`.
+The examples in `examples/cmake_build_installed` and `examples/cmake_build_in_tree` can help get you started.


 ## Configuring CMake
-A very basic installation is done with:
-````
-cmake ${srcdir} \
+A very basic installation of Kokkos is done with:
+````bash
+> cmake ${srcdir} \
 -DCMAKE_CXX_COMPILER=g++ \
- -DCMAKE_INSTALL_PREFIX=${my_install_folder}
+ -DCMAKE_INSTALL_PREFIX=${kokkos_install_folder}
 ````
 which builds and installs a default Kokkos when you run `make install`.
 There are numerous device backends, options, and architecture-specific optimizations that can be configured, e.g.
-````
-cmake ${srcdir} \
+````bash
+> cmake ${srcdir} \
 -DCMAKE_CXX_COMPILER=g++ \
- -DCMAKE_INSTALL_PREFIX=${my_install_folder} \
- -DKokkos_ENABLE_OPENMP=On
+ -DCMAKE_INSTALL_PREFIX=${kokkos_install_folder} \
+ -DKokkos_ENABLE_OPENMP=ON
 ````
 which activates the OpenMP backend. All of the options controlling device backends, options, architectures, and third-party libraries (TPLs) are given below.

@ -50,16 +62,16 @@ which activates the OpenMP backend. All of the options controlling device backen
 ## Spack
 An alternative to manually building with the CMake is to use the Spack package manager.
 To do so, download the `kokkos-spack` git repo and add to the package list:
-````
-spack repo add $path-to-kokkos-spack
+````bash
+> spack repo add $path-to-kokkos-spack
 ````
 A basic installation would be done as:
-````
-spack install kokkos
+````bash
+> spack install kokkos
 ````
 Spack allows options and compilers to be tuned in the install command.
-````
-spack install kokkos@3.0 %gcc@7.3.0 +openmp
+````bash
+> spack install kokkos@3.0 %gcc@7.3.0 +openmp
 ````
 This example illustrates the three most common parameters to Spack:
 * Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options.
@ -67,17 +79,17 @@ This example illustrates the three most common parameters to Spack:
 * Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%` option.

 For a complete list of Kokkos options, run:
+````bash
+> spack info kokkos
 ````
-spack info kokkos
-````
-More details can be found in the kokkos-spack repository [README](https://github.com/kokkos/kokkos-spack/blob/master/README.md).
+More details can be found in the [Spack README](Spack.md).

 #### Spack Development
 Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
 Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
 If you must know, you can locate Spack Kokkos installations with:
-````
-spack find -p kokkos ...
+````bash
+> spack find -p kokkos ...
 ````
 where `...` is the unique spec identifying the particular Kokkos configuration and version.

@ -102,8 +114,14 @@ Device backends can be enabled by specifying `-DKokkos_ENABLE_X`.
    * Whether to build Pthread backend
    * BOOL Default: OFF
 * Kokkos_ENABLE_SERIAL
-    * Whether to build serial  backend
+    * Whether to build serial backend
    * BOOL Default: ON
+* Kokkos_ENABLE_HIP (Experimental)
+    * Whether to build HIP backend
+    * BOOL Default: OFF
+* Kokkos_ENABLE_OPENMPTARGET (Experimental)
+    * Whether to build the OpenMP target backend
+    * BOOL Default: OFF

 ## Enable Options
 Options can be enabled by specifying `-DKokkos_ENABLE_X`.
@ -138,9 +156,6 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
    * Debug check on dual views
    * BOOL Default: OFF
-* Kokkos_ENABLE_DEPRECATED_CODE
-    * Whether to enable deprecated code
-    * BOOL Default: OFF
 * Kokkos_ENABLE_EXAMPLES
    * Whether to enable building examples
    * BOOL Default: OFF
@ -150,9 +165,6 @@ Options can be enabled by specifying `-DKokkos_ENABLE_X`.
 * Kokkos_ENABLE_LARGE_MEM_TESTS
    * Whether to perform extra large memory tests
    * BOOL Default: OFF
-* Kokkos_ENABLE_PROFILING
-    * Whether to create bindings for profiling tools
-    * BOOL Default: ON
 * Kokkos_ENABLE_PROFILING_LOAD_PRINT
    * Whether to print information about which profiling tools got loaded
    * BOOL Default: OFF
@ -235,8 +247,11 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_
 * Kokkos_ARCH_BGQ
    * Whether to optimize for the BGQ architecture
    * BOOL Default: OFF
-* Kokkos_ARCH_EPYC
-    * Whether to optimize for the EPYC architecture
+* Kokkos_ARCH_ZEN
+    * Whether to optimize for the Zen architecture
+    * BOOL Default: OFF
+* Kokkos_ARCH_ZEN2
+    * Whether to optimize for the Zen2 architecture
    * BOOL Default: OFF
 * Kokkos_ARCH_HSW
    * Whether to optimize for the HSW architecture
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,6 +1,113 @@
 # Change Log

-## [3.1.1](https://github.com/kokkos/kokkos/tree/3.1.1) (2020-04-14)
+## [3.2.00](https://github.com/kokkos/kokkos/tree/3.2.00) (2020-08-19)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.01...3.2.00)
+
+**Implemented enhancements:**
+
+- HIP:Enable stream in HIP [\#3163](https://github.com/kokkos/kokkos/issues/3163)
+- HIP:Add support for shuffle reduction for the HIP backend [\#3154](https://github.com/kokkos/kokkos/issues/3154)
+- HIP:Add implementations of missing HIPHostPinnedSpace methods for LAMMPS [\#3137](https://github.com/kokkos/kokkos/issues/3137)
+- HIP:Require HIP 3.5.0 or higher [\#3099](https://github.com/kokkos/kokkos/issues/3099)
+- HIP:WorkGraphPolicy for HIP [\#3096](https://github.com/kokkos/kokkos/issues/3096)
+- OpenMPTarget: Significant update to the new experimental backend.  Requires C++17, works on Intel GPUs, reference counting fixes. [\#3169](https://github.com/kokkos/kokkos/issues/3169)
+- Windows Cuda support [\#3018](https://github.com/kokkos/kokkos/issues/3018)
+- Pass `-Wext-lambda-captures-this` to NVCC when support for `__host__ __device__` lambda is enabled from CUDA 11 [\#3241](https://github.com/kokkos/kokkos/issues/3241)
+- Use explicit staging buffer for constant memory kernel launches and cleanup host/device synchronization [\#3234](https://github.com/kokkos/kokkos/issues/3234)
+- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 1: [\#3202](https://github.com/kokkos/kokkos/issues/3202)
+- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 2: [\#3203](https://github.com/kokkos/kokkos/issues/3203)
+- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable 3: [\#3196](https://github.com/kokkos/kokkos/issues/3196)
+- Annotations for `DefaultExecutionSpace` and `DefaultHostExecutionSpace` to use in static analysis [\#3189](https://github.com/kokkos/kokkos/issues/3189)
+- Add documentation on using Spack to install Kokkos and developing packages that depend on Kokkos [\#3187](https://github.com/kokkos/kokkos/issues/3187)
+- Improve support for nvcc\_wrapper with exotic host compiler [\#3186](https://github.com/kokkos/kokkos/issues/3186)
+- Add OpenMPTarget backend flags for NVC++ compiler [\#3185](https://github.com/kokkos/kokkos/issues/3185)
+- Move deep\_copy/create\_mirror\_view on Experimental::OffsetView into Kokkos:: namespace [\#3166](https://github.com/kokkos/kokkos/issues/3166)
+- Allow for larger block size in HIP [\#3165](https://github.com/kokkos/kokkos/issues/3165)
+- View: Added names of Views to the different View initialize/free kernels [\#3159](https://github.com/kokkos/kokkos/issues/3159)
+- Cuda: Caching cudaFunctorAttributes and whether L1/Shmem prefer was set [\#3151](https://github.com/kokkos/kokkos/issues/3151)
+- BuildSystem: Provide an explicit default CMAKE\_BUILD\_TYPE [\#3131](https://github.com/kokkos/kokkos/issues/3131)
+- Cuda: Update CUDA occupancy calculation [\#3124](https://github.com/kokkos/kokkos/issues/3124)
+- Vector: Adding data() to Vector [\#3123](https://github.com/kokkos/kokkos/issues/3123)
+- BuildSystem: Add CUDA Ampere configuration support [\#3122](https://github.com/kokkos/kokkos/issues/3122)
+- General: Apply [[noreturn]] to Kokkos::abort when applicable [\#3106](https://github.com/kokkos/kokkos/issues/3106)
+- TeamPolicy: Validate storage level argument passed to TeamPolicy::set\_scratch\_size() [\#3098](https://github.com/kokkos/kokkos/issues/3098)
+- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092)
+- BuildSystem: Make kokkos\_has\_string() function in Makefile.kokkos case insensitive [\#3091](https://github.com/kokkos/kokkos/issues/3091)
+- Modify KOKKOS\_FUNCTION macro for clang-tidy analysis [\#3087](https://github.com/kokkos/kokkos/issues/3087)
+- Move allocation profiling to allocate/deallocate calls [\#3084](https://github.com/kokkos/kokkos/issues/3084)
+- BuildSystem: FATAL\_ERROR when attempting in-source build [\#3082](https://github.com/kokkos/kokkos/issues/3082)
+- Change enums in ScatterView to types [\#3076](https://github.com/kokkos/kokkos/issues/3076)
+- HIP: Changes for new compiler/runtime [\#3067](https://github.com/kokkos/kokkos/issues/3067)
+- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061)
+- Extract and use get\_gpu [\#3048](https://github.com/kokkos/kokkos/issues/3048)
+- Add is\_allocated to View-like containers [\#3059](https://github.com/kokkos/kokkos/issues/3059)
+- Combined reducers for scalar references [\#3052](https://github.com/kokkos/kokkos/issues/3052)
+- Add configurable capacity for UniqueToken [\#3051](https://github.com/kokkos/kokkos/issues/3051)
+- Add installation testing [\#3034](https://github.com/kokkos/kokkos/issues/3034)
+- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021)
+- HIP: Add UniqueToken [\#3020](https://github.com/kokkos/kokkos/issues/3020)
+- Autodetect number of devices [\#3013](https://github.com/kokkos/kokkos/issues/3013)
+
+
+**Fixed bugs:**
+
+- Check error code from `cudaStreamSynchronize` in CUDA fences [\#3255](https://github.com/kokkos/kokkos/issues/3255)
+- Fix issue with C++ standard flags when using `nvcc\_wrapper` with PGI [\#3254](https://github.com/kokkos/kokkos/issues/3254)
+- Add missing threadfence in lock-based atomics [\#3208](https://github.com/kokkos/kokkos/issues/3208)
+- Fix dedup of linker flags for shared lib on CMake <=3.12 [\#3176](https://github.com/kokkos/kokkos/issues/3176)
+- Fix memory leak with CUDA streams [\#3170](https://github.com/kokkos/kokkos/issues/3170)
+- BuildSystem: Fix OpenMP Target flags for Cray [\#3161](https://github.com/kokkos/kokkos/issues/3161)
+- ScatterView: fix for OpenmpTarget remove inheritance from reducers [\#3162](https://github.com/kokkos/kokkos/issues/3162)
+- BuildSystem: Set OpenMP flags according to host compiler [\#3127](https://github.com/kokkos/kokkos/issues/3127)
+- OpenMP: Fix logic for nested omp in partition\_master bug [\#3101](https://github.com/kokkos/kokkos/issues/3101)
+- BuildSystem: Fixes for Cuda/11 and c++17 [\#3085](https://github.com/kokkos/kokkos/issues/3085)
+- HIP: Fix print\_configuration [\#3080](https://github.com/kokkos/kokkos/issues/3080)
+- Conditionally define get\_gpu [\#3072](https://github.com/kokkos/kokkos/issues/3072)
+- Fix bounds for ranges in random number generator [\#3069](https://github.com/kokkos/kokkos/issues/3069)
+- Fix Cuda minor arch check [\#3035](https://github.com/kokkos/kokkos/issues/3035)
+
+**Incompatibilities:**
+
+- Remove ETI support [\#3157](https://github.com/kokkos/kokkos/issues/3157)
+- Remove KOKKOS\_INTERNAL\_ENABLE\_NON\_CUDA\_BACKEND [\#3147](https://github.com/kokkos/kokkos/issues/3147)
+- Remove core/unit\_test/config [\#3146](https://github.com/kokkos/kokkos/issues/3146)
+- Removed the preprocessor branch for KOKKOS\_ENABLE\_PROFILING [\#3115](https://github.com/kokkos/kokkos/issues/3115)
+- Disable profiling with MSVC [\#3066](https://github.com/kokkos/kokkos/issues/3066)
+
+**Closed issues:**
+
+- Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097)
+- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095)
+- Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083)
+- In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081)
+- Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070)
+- DefaultInit tests failing when using CTest resource allocation feature [\#3040](https://github.com/kokkos/kokkos/issues/3040)
+- Add installation testing.  [\#3037](https://github.com/kokkos/kokkos/issues/3037)
+- nvcc\_wrapper needs to handle `-expt-relaxed-constexpr` flag [\#3017](https://github.com/kokkos/kokkos/issues/3017)
+- CPU core oversubscription warning on macOS with OpenMP backend [\#2996](https://github.com/kokkos/kokkos/issues/2996)
+- Default behavior of KOKKOS\_NUM\_DEVICES to use all devices available [\#2975](https://github.com/kokkos/kokkos/issues/2975)
+- Assert blocksize \> 0 [\#2974](https://github.com/kokkos/kokkos/issues/2974)
+- Add ability to assign kokkos profile function from executable  [\#2973](https://github.com/kokkos/kokkos/issues/2973)
+- ScatterView Support for the pre/post increment operator [\#2967](https://github.com/kokkos/kokkos/issues/2967)
+
+- Compiler issue: Cuda build with clang 10 has errors with the atomic unit tests [\#3237](https://github.com/kokkos/kokkos/issues/3237)
+- Incompatibility of flags for C++ standard with PGI v20.4 on Power9/NVIDIA V100 system [\#3252](https://github.com/kokkos/kokkos/issues/3252)
+- Error configuring as subproject [\#3140](https://github.com/kokkos/kokkos/issues/3140)
+- CMake fails with Nvidia compilers when the GPU architecture option is not supplied (Fix configure with OMPT and Cuda) [\#3207](https://github.com/kokkos/kokkos/issues/3207)
+- PGI compiler being passed the gcc -fopenmp flag [\#3125](https://github.com/kokkos/kokkos/issues/3125)
+- Cuda: Memory leak when using CUDA stream [\#3167](https://github.com/kokkos/kokkos/issues/3167)
+- RangePolicy has an implicitly deleted assignment operator [\#3192](https://github.com/kokkos/kokkos/issues/3192)
+- MemorySpace::allocate needs to have memory pool counting.  [\#3064](https://github.com/kokkos/kokkos/issues/3064)
+- Missing write fence for lock based atomics on CUDA [\#3038](https://github.com/kokkos/kokkos/issues/3038)
+- CUDA compute capability version check problem [\#3026](https://github.com/kokkos/kokkos/issues/3026)
+- Make DynRankView fencing consistent [\#3014](https://github.com/kokkos/kokkos/issues/3014)
+- nvcc\_wrapper cant handle -Xcompiler -o out.o [\#2993](https://github.com/kokkos/kokkos/issues/2993)
+- Reductions of non-trivial types of size 4 fail in CUDA shfl operations [\#2990](https://github.com/kokkos/kokkos/issues/2990)
+- complex\_double misalignment in reduce, clang+CUDA [\#2989](https://github.com/kokkos/kokkos/issues/2989)
+- Span of degenerated \(zero-length\) subviews is not zero in some special cases [\#2979](https://github.com/kokkos/kokkos/issues/2979)
+- Rank 1 custom layouts dont work as expected. [\#2840](https://github.com/kokkos/kokkos/issues/2840)
+
+## [3.1.01](https://github.com/kokkos/kokkos/tree/3.1.1) (2020-04-14)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.00...3.1.1)

 **Fixed bugs:**
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@ -1,4 +1,9 @@

+# Disable in-source builds to prevent source tree corruption.
+if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" )
+  message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files." )
+endif()
+
 # We want to determine if options are given with the wrong case
 # In order to detect which arguments are given to compare against
 # the list of valid arguments, at the beginning here we need to
@ -34,6 +39,9 @@ IF(COMMAND TRIBITS_PACKAGE_DECL)
 ELSE()
  SET(KOKKOS_HAS_TRILINOS OFF)
 ENDIF()
+# Is this build a subdirectory of another project
+GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY)
+

 INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake)
@ -75,16 +83,17 @@ IF(NOT KOKKOS_HAS_TRILINOS)
      SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE)
      SET(ENV{CXX} ${SPACK_CXX})
    ENDIF()
-  ENDif()
-  IF(NOT DEFINED ${PROJECT_NAME})
-    # WORKAROUND FOR HIPCC
-    IF(Kokkos_ENABLE_HIP)
-      SET(KOKKOS_INTERNAL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
-      SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --amdgpu-target=gfx906")
-    ENDIF()
-    PROJECT(Kokkos CXX)
-    IF(Kokkos_ENABLE_HIP)
-      SET(CMAKE_CXX_FLAGS ${KOKKOS_INTERNAL_CMAKE_CXX_FLAGS})
+  ENDIF()
+  # Always call the project command to define Kokkos_ variables
+  # and to make sure that C++ is an enabled language
+  PROJECT(Kokkos CXX)
+  IF(NOT HAS_PARENT)
+    IF (NOT CMAKE_BUILD_TYPE)
+      SET(DEFAULT_BUILD_TYPE "RelWithDebInfo")
+      MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.")
+      SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING
+          "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel."
+          FORCE)
    ENDIF()
  ENDIF()
 ENDIF()
@ -102,8 +111,8 @@ ENDIF()


 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 1)
-set(Kokkos_VERSION_PATCH 1)
+set(Kokkos_VERSION_MINOR 2)
+set(Kokkos_VERSION_PATCH 0)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")

@ -147,6 +156,7 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake)
 # Check the environment and set certain variables
 # to allow platform-specific checks
 INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake)
+
 # The build environment setup goes in the following steps
 # 1) Check all the enable options. This includes checking Kokkos_DEVICES
 # 2) Check the compiler ID (type and version)
@ -169,7 +179,6 @@ SET(KOKKOS_EXT_LIBRARIES Kokkos::kokkos Kokkos::kokkoscore Kokkos::kokkoscontain
 SET(KOKKOS_INT_LIBRARIES kokkos kokkoscore kokkoscontainers kokkosalgorithms)
 SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES ${KOKKOS_INT_LIBRARIES})

-GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY)
 IF (KOKKOS_HAS_TRILINOS)
  SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
  SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR})
@ -203,7 +212,7 @@ IF (KOKKOS_HAS_TRILINOS)
    SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
    LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
  ENDFOREACH()
-  SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
+  SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
  IF (KOKKOS_ENABLE_CUDA)
    STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS    "${KOKKOS_CUDA_OPTIONS}")
    FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
@ -246,7 +255,7 @@ KOKKOS_PACKAGE_POSTPROCESS()
 #We are ready to configure the header
 CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY)

-IF (NOT KOKKOS_HAS_TRILINOS)
+IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
  ADD_LIBRARY(kokkos INTERFACE)
  #Make sure in-tree projects can reference this as Kokkos::
  #to match the installed target names
@ -262,8 +271,6 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake)
 # If the argument of DESTINATION is a relative path, CMake computes it
 # as relative to ${CMAKE_INSTALL_PATH}.
 INSTALL(PROGRAMS ${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper DESTINATION ${CMAKE_INSTALL_BINDIR})
-INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
-

 #  Finally - if we are a subproject - make sure the enabled devices are visible
 IF (HAS_PARENT)
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -11,20 +11,20 @@ CXXFLAGS += $(SHFLAGS)
 endif

 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 1
-KOKKOS_VERSION_PATCH = 1
+KOKKOS_VERSION_MINOR = 2
+KOKKOS_VERSION_PATCH = 0
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

 # Options: Cuda,HIP,ROCm,OpenMP,Pthread,Serial
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthread"
-# Options:
+# Options: 
 # Intel:    KNC,KNL,SNB,HSW,BDW,SKX
-# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75
+# NVIDIA:   Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80
 # ARM:      ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2
 # IBM:      BGQ,Power7,Power8,Power9
 # AMD-GPUS: Vega900,Vega906
-# AMD-CPUS: AMDAVX,EPYC
+# AMD-CPUS: AMDAVX,Zen,Zen2
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
@ -32,10 +32,8 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++11"
-# Options: aggressive_vectorization,disable_profiling,enable_deprecated_code,disable_deprecated_code,enable_large_mem_tests,disable_complex_align
+# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align
 KOKKOS_OPTIONS ?= ""
-# Option for setting ETI path
-KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
 KOKKOS_CMAKE ?= "no"
 KOKKOS_TRIBITS ?= "no"
 KOKKOS_STANDALONE_CMAKE ?= "no"
@ -74,6 +72,7 @@ KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),
 KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17)
 KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
 KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
+KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++20)

 # Check for external libraries.
 KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
@ -83,9 +82,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper
 # Check for advanced settings.
 KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
-KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling)
-KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
-KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
 KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
 KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
@ -96,7 +93,6 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
 KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
-KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)

 KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)

@ -140,6 +136,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  KOKKOS_DEVICELIST += OPENMPTARGET
+  KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
+                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
+                                                    + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
+  ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
+    $(error OpenMPTarget backend requires C++17 or newer)
+  endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@ -281,7 +283,7 @@ endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
  KOKKOS_INTERNAL_CXX11_FLAG := --c++11
  KOKKOS_INTERNAL_CXX14_FLAG := --c++14
-  #KOKKOS_INTERNAL_CXX17_FLAG := --c++17
+  KOKKOS_INTERNAL_CXX17_FLAG := --c++17
 else
  ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
     KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
@ -338,35 +340,27 @@ KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pas
 KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70)
 KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
 KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
+KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70)   \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72)   \
                                              + $(KOKKOS_INTERNAL_USE_ARCH_TURING75)  \
-                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
-                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
-                                              + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
+                                              + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80))

 #SEK: This seems like a bug to me
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
  KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell)
  KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler)
-  KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70)   \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72)   \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_TURING75)  \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
-                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53))
+  KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50))
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
@ -394,19 +388,20 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_

 # AMD based.
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
-KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),EPYC)
+KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
+KOKKOS_INTERNAL_USE_ARCH_ZEN := $(if $(filter 1,$(KOKKOS_INTERNAL_USE_ARCH_ZEN2)),0,$(call kokkos_has_string,$(KOKKOS_ARCH),Zen))
 KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
 KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)

 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42      := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
-KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
+KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))

 # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_EPYC))
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2))
 KOKKOS_INTERNAL_USE_ISA_KNC       := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
 KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@ -430,7 +425,7 @@ endif
 KOKKOS_CPPFLAGS =
 KOKKOS_LIBDIRS =
 ifneq ($(KOKKOS_CMAKE), yes)
-  KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
+  KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
 endif
 KOKKOS_TPL_INCLUDE_DIRS =
 KOKKOS_TPL_LIBRARY_DIRS =
@ -458,88 +453,91 @@ KOKKOS_CONFIG_HEADER=KokkosCore_config.h
 # Functions for generating config header file
 kokkos_append_header = $(shell echo $1 >> $(KOKKOS_INTERNAL_CONFIG_TMP))

+# Keep a literal '#' in $H: since GNU make 4.3 a "\#" inside a function call is no longer unescaped, so the header lines below are written with $H instead.
+H := \#
+
 # Do not append first line
 tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
 tmp := $(call kokkos_append_header,"Makefile constructed configuration:")
 tmp := $(call kokkos_append_header,"$(shell date)")
 tmp := $(call kokkos_append_header,"----------------------------------------------*/")

-tmp := $(call kokkos_append_header,'\#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)')
-tmp := $(call kokkos_append_header,'\#error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."')
-tmp := $(call kokkos_append_header,'\#else')
-tmp := $(call kokkos_append_header,'\#define KOKKOS_CORE_CONFIG_H')
-tmp := $(call kokkos_append_header,'\#endif')
+tmp := $(call kokkos_append_header,'$H''if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)')
+tmp := $(call kokkos_append_header,'$H''error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."')
+tmp := $(call kokkos_append_header,'$H''else')
+tmp := $(call kokkos_append_header,'$H''define KOKKOS_CORE_CONFIG_H')
+tmp := $(call kokkos_append_header,'$H''endif')

 tmp := $(call kokkos_append_header,"")
-tmp := $(call kokkos_append_header,"\#define KOKKOS_VERSION $(KOKKOS_VERSION)")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_VERSION $(KOKKOS_VERSION)")
 tmp := $(call kokkos_append_header,"")
-
+	
 tmp := $(call kokkos_append_header,"/* Execution Spaces */")

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
-  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
-  tmp := $(call kokkos_append_header,'\#define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_ROCM')
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_HIP')
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMPTARGET')
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
  endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMP')
+  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMP')
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_THREADS")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_THREADS")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_SERIAL")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SERIAL")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_TM), 1)
-  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TM")
-  tmp := $(call kokkos_append_header,"\#endif")
+  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TM")
+  tmp := $(call kokkos_append_header,"$H""endif")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
-  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_X86_64")
-  tmp := $(call kokkos_append_header,"\#endif")
+  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_X86_64")
+  tmp := $(call kokkos_append_header,"$H""endif")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
-  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_KNC")
-  tmp := $(call kokkos_append_header,"\#endif")
+  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_KNC")
+  tmp := $(call kokkos_append_header,"$H""endif")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
-  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCLE")
-  tmp := $(call kokkos_append_header,"\#endif")
+  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCLE")
+  tmp := $(call kokkos_append_header,"$H""endif")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1)
-  tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCBE")
-  tmp := $(call kokkos_append_header,"\#endif")
+  tmp := $(call kokkos_append_header,"$H""ifndef __CUDA_ARCH__")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_ISA_POWERPCBE")
+  tmp := $(call kokkos_append_header,"$H""endif")
 endif

 #only add the c++ standard flags if this is not CMake
@ -548,34 +546,39 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
 endif
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX11")
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
 endif
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14")
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14")
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG)
 endif
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17")
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17")
 endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX20")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20")
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX20), 1)
+  #I cannot make CMake add this in a good way - so add it here
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX20_FLAG)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20")
 endif

 ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
@ -585,20 +588,26 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)

  KOKKOS_CXXFLAGS += -g
  KOKKOS_LDFLAGS += -g
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG")
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
  endif
 endif
 ifeq ($(KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN), 0)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_COMPLEX_ALIGN")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_COMPLEX_ALIGN")
 endif

 ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
 endif

+ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING")
+endif
+
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL")
+
 ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
  ifneq ($(KOKKOS_CMAKE), yes)
    ifneq ($(HWLOC_PATH),)
@ -611,11 +620,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
    KOKKOS_LIBS += -lhwloc
    KOKKOS_TPL_LIBRARY_NAMES += hwloc
  endif
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HWLOC")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_LIBRT")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT")
  KOKKOS_LIBS += -lrt
  KOKKOS_TPL_LIBRARY_NAMES += rt
 endif
@ -632,50 +641,36 @@ ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
    KOKKOS_LIBS += -lmemkind -lnuma
    KOKKOS_TPL_LIBRARY_NAMES += memkind numa
  endif
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HBWSPACE")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING")
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
-  ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
-  endif
-endif
-
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_ETI")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE")
 endif

 ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LARGE_MEM_TESTS")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS")
 endif

 tmp := $(call kokkos_append_header,"/* Optimization Settings */")

 ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
 endif

 tmp := $(call kokkos_append_header,"/* Cuda Settings */")

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
  else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
    endif
  endif

  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_UVM")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_UVM")
  endif

  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      KOKKOS_CXXFLAGS += -fcuda-rdc
      KOKKOS_LDFLAGS += -fcuda-rdc
@ -696,7 +691,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
-        tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LAMBDA")
+        tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
        KOKKOS_CXXFLAGS += -expt-extended-lambda
      else
        $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
@ -704,14 +699,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    endif

    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LAMBDA")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
    endif
  endif

  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 80; echo $$?),0)
-        tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_CONSTEXPR")
+        tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR")
        KOKKOS_CXXFLAGS += -expt-relaxed-constexpr
      else
        $(warning Warning: Cuda relaxed constexpr support was requested but NVCC version is too low. This requires NVCC for Cuda version 8.0 or higher. Disabling relaxed constexpr support now.)
@ -719,25 +714,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    endif

    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_CONSTEXPR")
+      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR")
    endif
  endif

  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
  endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
  endif
 endif

 # Add Architecture flags.

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
@ -754,7 +749,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
@ -770,9 +765,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
  endif
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_EPYC")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AMD_AVX2")
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -mavx2
@ -783,9 +778,22 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_EPYC), 1)
  endif
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1)
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -mavx2
+    KOKKOS_LDFLAGS += -mavx2
+  else
+    KOKKOS_CXXFLAGS += -march=znver2 -mtune=znver2
+    KOKKOS_LDFLAGS += -march=znver2 -mtune=znver2
+  endif
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
@ -802,8 +810,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
@ -820,7 +828,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_SSE42")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xSSE4.2
@ -842,7 +850,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -mavx
@ -864,7 +872,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER7")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER7")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

@ -876,7 +884,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER8")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER8")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

@ -897,7 +905,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER9")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER9")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

@ -918,7 +926,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xCORE-AVX2
@ -940,7 +948,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xCORE-AVX2
@ -962,7 +970,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512MIC")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512MIC")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xMIC-AVX512
@ -983,7 +991,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512XEON")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")

  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xCORE-AVX512
@ -1004,7 +1012,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KNC")
+  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KNC")
  KOKKOS_CXXFLAGS += -mmic
  KOKKOS_LDFLAGS += -mmic
 endif
@ -1022,8 +1030,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=-arch
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
-    KOKKOS_CXXFLAGS += -x cuda
+		KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--cuda-gpu-arch
+		KOKKOS_CXXFLAGS += -x cuda
  else
    $(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
  endif
@ -1039,65 +1047,70 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER30")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER32")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER35")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER37")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL50")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL52")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL53")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL60")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL61")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA70")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VOLTA72")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_TURING75")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
  endif
+  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1)
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
+    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
+  endif

  ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
@ -1121,13 +1134,13 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  # Lets start with adding architecture defines
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_HIP 900")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA900")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 900")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA900")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx900
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_HIP 906")
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_VEGA906")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 906")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx906
  endif

@ -1138,7 +1151,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)

  ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
+    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
    KOKKOS_CXXFLAGS+=-fgpu-rdc
    KOKKOS_LDFLAGS+=-fgpu-rdc
  else
@ -1171,9 +1184,6 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Cuda/*.cpp)
-endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
  ifneq ($(CUDA_PATH),)
    KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include
@ -1211,9 +1221,6 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/OpenMP/*.cpp)
-endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)

  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@ -1228,9 +1235,6 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Threads/*.cpp)
-endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
  KOKKOS_LIBS += -lpthread
  KOKKOS_TPL_LIBRARY_NAMES += pthread
@ -1279,9 +1283,6 @@ endif
 # Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
 # device to avoid a link warning.
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Serial/*.cpp)
-endif
 endif
 ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@ -26,21 +26,17 @@ Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spi
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
 Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
-Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
 Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
 Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
 Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp 
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp

-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  include $(KOKKOS_ETI_PATH)/Serial/Makefile.eti_Serial
-endif
-endif
-
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@ -50,9 +46,6 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  include $(KOKKOS_ETI_PATH)/Cuda/Makefile.eti_Cuda
-endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
@ -75,9 +68,6 @@ Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_RO
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
 Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  include $(KOKKOS_ETI_PATH)/ROCm/Makefile.eti_ROCm
-endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
@ -85,9 +75,6 @@ Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
 Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  include $(KOKKOS_ETI_PATH)/Threads/Makefile.eti_Threads
-endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -95,9 +82,6 @@ Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokko
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
 Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
-ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
-  include $(KOKKOS_ETI_PATH)/OpenMP/Makefile.eti_OpenMP
-endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@ -151,7 +151,7 @@ Full details are given in the [build instructions](BUILD.md). Basic setups are s
 ## CMake

 The best way to install Kokkos is using the CMake build system. Assuming Kokkos lives in `$srcdir`:
-````
+````bash
 cmake $srcdir \
  -DCMAKE_CXX_COMPILER=$path_to_compiler \
  -DCMAKE_INSTALL_PREFIX=$path_to_install \
@ -170,7 +170,7 @@ and run `make test` after completing the build.

 For your CMake project using Kokkos, code such as the following:

-````
+````cmake
 find_package(Kokkos)
 ...
 target_link_libraries(myTarget Kokkos::kokkos)
@ -187,17 +187,15 @@ for the install location given above.

 ## Spack
 An alternative to manually building with the CMake is to use the Spack package manager.
-To do so, download the `kokkos-spack` git repo and add to the package list:
-````
-spack repo add $path-to-kokkos-spack
+To get started, download the Spack [repo](https://github.com/spack/spack).
 ````
 A basic installation would be done as:
-````
-spack install kokkos
+````bash
+> spack install kokkos
 ````
 Spack allows options and compilers to be tuned in the install command.
-````
-spack install kokkos@3.0 %gcc@7.3.0 +openmp
+````bash
+> spack install kokkos@3.0 %gcc@7.3.0 +openmp
 ````
 This example illustrates the three most common parameters to Spack:
 * Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options.
@ -205,33 +203,33 @@ This example illustrates the three most common parameters to Spack:
 * Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%` option.

 For a complete list of Kokkos options, run:
-````
-spack info kokkos
+````bash
+> spack info kokkos
 ````
 Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
 Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
 More details are given in the [build instructions](BUILD.md). If you must know, you can locate Spack Kokkos installations with:
-````
-spack find -p kokkos ...
+````bash
+> spack find -p kokkos ...
 ````
 where `...` is the unique spec identifying the particular Kokkos configuration and version.
-
+Some more details can be found in the Kokkos Spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest).

 ## Raw Makefile
 A bash script is provided to generate raw makefiles.
 To install Kokkos as a library create a build directory and run the following
-````
-$KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
+````bash
+> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
 ````
 Once the Makefile is generated, run:
-````
-make kokkoslib
-make install
+````bash
+> make kokkoslib
+> make install
 ````
 To additionally run the unit tests:
-````
-make build-test
-make test
+````bash
+> make build-test
+> make test
 ````
 Run `generate_makefile.bash --help` for more detailed options such as
 changing the device type for which to build.
@ -274,7 +272,7 @@ more than a single GPU is used by a single process.

 If you publish work which mentions Kokkos, please cite the following paper:

-````
+````BibTeX
@article{CarterEdwards20143202,
  title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
  journal = "Journal of Parallel and Distributed Computing ",
--- a/lib/kokkos/Spack.md
+++ b/lib/kokkos/Spack.md
@ -0,0 +1,267 @@
+![Kokkos](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4)
+
+# Kokkos Spack
+
+This gives instructions for using Spack to install Kokkos and developing packages that depend on Kokkos.
+
+## Getting Started
+
+Make sure you have downloaded [Spack](https://github.com/spack/spack).
+The easiest way to configure the Spack environment is:
+````bash
+> source spack/share/spack/setup-env.sh
+````
+with other scripts available for other shells.
+You can display information about how to install packages with:
+````bash
+> spack info kokkos
+````
+This will print all the information about how to install Kokkos with Spack.
+For detailed instructions on how to use Spack, see the [User Manual](https://spack.readthedocs.io).
+
+## Setting Up Spack: Avoiding the Package Cascade
+By default, Spack doesn't 'see' anything on your system - including things like CMake and CUDA.
+This can be limited by adding a `packages.yaml` to your `$HOME/.spack` folder that includes CMake (and CUDA, if applicable).  For example, your `packages.yaml` file could be:
+````yaml
+packages:
+ cuda:
+  modules:
+   cuda@10.1.243: [cuda/10.1.243]
+  paths:
+   cuda@10.1.243:
+    /opt/local/ppc64le-pwr8-nvidia/cuda/10.1.243
+  buildable: false
+ cmake:
+  modules:
+   cmake: [cmake/3.16.8]
+  paths:
+   cmake:
+    /opt/local/ppc64le/cmake/3.16.8
+  buildable: false
+````
+The `modules` entry is only necessary on systems that require loading Modules (i.e. most DOE systems).
+The `buildable` flag is useful to make sure Spack crashes if there is a path error,
+rather than having a typo and Spack rebuilding everything because `cmake` isn't found.
+You can verify your environment is set up correctly by running `spack graph` or `spack spec`.
+For example:
+````bash
+> spack graph kokkos +cuda
+o  kokkos
+|\
+o |  cuda
+ /
+o  cmake
+````
+Without the existing CUDA and CMake being identified in `packages.yaml`, a subset (!) of the output would be:
+````bash
+o  kokkos
+|\
+| o  cmake
+| |\
+| | | |\
+| | | | | |\
+| | | | | | | |\
+| | | | | | | | | |\
+| | | | | | | o | | |  libarchive
+| | | | | | | |\ \ \ \
+| | | | | | | | | |\ \ \ \
+| | | | | | | | | | | | |_|/
+| | | | | | | | | | | |/| |
+| | | | | | | | | | | | | o  curl
+| | |_|_|_|_|_|_|_|_|_|_|/|
+| |/| | | |_|_|_|_|_|_|_|/
+| | | | |/| | | | | | | |
+| | | | o | | | | | | | |  openssl
+| |/| | | | | | | | | | |
+| | | | | | | | | | o | |  libxml2
+| | |_|_|_|_|_|_|_|/| | |
+| | | | | | | | | | |\ \ \
+| o | | | | | | | | | | | |  zlib
+|  / / / / / / / / / / / /
+| o | | | | | | | | | | |  xz
+|  / / / / / / / / / / /
+| o | | | | | | | | | |  rhash
+|  / / / / / / / / / /
+| | | | o | | | | | |  nettle
+| | | | |\ \ \ \ \ \ \
+| | | o | | | | | | | |  libuv
+| | | | o | | | | | | |  autoconf
+| | |_|/| | | | | | | |
+| | | | |/ / / / / / /
+| o | | | | | | | | |  perl
+| o | | | | | | | | |  gdbm
+| o | | | | | | | | |  readline
+````
+
+## Configuring Kokkos as a Project Dependency
+Say you have a project "SuperScience" which needs to use Kokkos.
+In your `package.py` file, you would generally include something like:
+````python
+class SuperScience(CMakePackage):
+  ...
+  depends_on("kokkos")
+````
+Often projects want to tweak behavior when using certain features, e.g.
+````python
+  depends_on("kokkos+cuda", when="+cuda")
+````
+if your project needs CUDA-specific logic to configure and build.
+This illustrates the general principle in Spack of "flowing-up".
+A user requests a feature in the final app:
+````bash
+> spack install superscience+cuda
+````
+This flows upstream to the Kokkos dependency, causing the `kokkos+cuda` variant to build.
+The downstream app (SuperScience) tells the upstream app (Kokkos) how to build.
+
+Because Kokkos is a performance portability library, it somewhat inverts this principle.
+Kokkos "flows-down", telling your application how best to configure for performance.
+Rather than a downstream app (SuperScience) telling the upstream (Kokkos) what variants to build,
+a pre-built Kokkos should be telling the downstream app SuperScience what variants to use.
+Kokkos works best when there is an "expert" configuration installed on your system.
+Your build should simply request `-DKokkos_ROOT=<BEST_KOKKOS_FOR_MY_SYSTEM>` and configure appropriately based on the Kokkos it finds.
+
+Kokkos has many, many build variants.
+Where possible, projects should only depend on a general Kokkos, not specific variants.
+Instead, for each system you build on, we recommend adding a Kokkos configuration to your `packages.yaml` file (usually found in `~/.spack` for specific users).
+For a Xeon + Volta system, this could look like:
+````yaml
+ kokkos:
+  variants: +cuda +openmp +cuda_lambda +wrapper ^cuda@10.1 cuda_arch=70
+  compiler: [gcc@7.2.0]
+````
+which gives the "best" Kokkos configuration as CUDA+OpenMP optimized for a Volta 70 architecture using CUDA 10.1.
+It also enables support for CUDA Lambdas.
+The `+wrapper` option tells Kokkos to build with the special `nvcc_wrapper` (more below).
+Note here that we use the built-in `cuda_arch` variant of Spack to specify the architecture.
+For a Haswell system, we use
+````yaml
+ kokkos:
+  variants: +openmp std=14 target=haswell
+  compiler: [intel@18]
+````
+which uses the built-in microarchitecture variants of Spack.
+Consult the Spack documentation for more details of Spack microarchitectures
+and CUDA architectures.
+Spack does not currently provide an AMD GPU microarchitecture option.
+If building for HIP or an AMD GPU, Kokkos provides an `amd_gpu_arch` similar to `cuda_arch`.
+````yaml
+ kokkos:
+  variants: +hip amd_gpu_arch=vega900
+````
+
+Without an optimal default in your `packages.yaml` file, it is highly likely that the default Kokkos configuration you get will not be what you want.
+For example, CUDA is not enabled by default (there is no easy logic to conditionally activate this for CUDA-enabled systems).
+If you don't specify a CUDA build variant in a `packages.yaml` and you build your Kokkos-dependent project:
+````bash
+> spack install superscience
+````
+you may end up just getting the default Kokkos (i.e. Serial).
+Some examples are included in the `config/yaml` folder for common platforms.
+Before running `spack install <package>` we recommend running `spack spec <package>` to confirm your dependency tree is correct.
+For example, with Kokkos Kernels:
+````bash
+kokkos-kernels@3.0%gcc@8.3.0~blas build_type=RelWithDebInfo ~cblas~complex_double~complex_float~cublas~cuda cuda_arch=none ~cusparse~diy+double execspace_cuda=auto execspace_openmp=auto execspace_serial=auto execspace_threads=auto ~float~lapack~lapacke+layoutleft~layoutright memspace_cudaspace=auto memspace_cudauvmspace=auto +memspace_hostspace~mkl+offset_int+offset_size_t~openmp+ordinal_int~ordinal_int64_t~serial~superlu arch=linux-rhel7-skylake_avx512
+    ^cmake@3.16.2%gcc@8.3.0~doc+ncurses+openssl+ownlibs~qt arch=linux-rhel7-skylake_avx512
+        ^kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=14 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512
+                ^cuda@10.1%gcc@8.3.0 arch=linux-rhel7-skylake_avx512
+                        ^kokkos-nvcc-wrapper@old%gcc@8.3.0 build_type=RelWithDebInfo +mpi arch=linux-rhel7-skylake_avx512
+                                    ^openmpi@4.0.2%gcc@8.3.0~cuda+cxx_exceptions fabrics=none ~java~legacylaunchers~memchecker patches=073477a76bba780c67c36e959cd3ee6910743e2735c7e76850ffba6791d498e4 ~pmi schedulers=none ~sqlite3~thread_multiple+vt arch=linux-rhel7-skylake_avx512
+````
+The output can be very verbose, but we can verify the expected `kokkos`:
+````bash
+kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=11 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512
+````
+We see that we do have, e.g., `+volta70` and `+wrapper`.
+
+### Spack Environments
+The encouraged way to use Spack is with Spack environments ([more details here](https://spack-tutorial.readthedocs.io/en/latest/tutorial_environments.html#dealing-with-many-specs-at-once)).
+Rather than installing packages one-at-a-time, you add packages to an environment.
+After adding all packages, you concretize and install them all.
+Using environments, one can explicitly add a desired Kokkos for the environment, e.g.
+````bash
+> spack add kokkos +cuda +cuda_lambda +volta70
+> spack add my_project +my_variant
+> ...
+> spack install
+````
+All packages within the environment will build against the CUDA-enabled Kokkos,
+even if they only request a default Kokkos.
+
+## NVCC Wrapper
+Kokkos is a C++ project, but often builds for the CUDA backend.
+This is particularly problematic with CMake. At this point, `nvcc` does not accept all the flags that normally get passed to a C++ compiler.
+Kokkos provides `nvcc_wrapper` that identifies correctly as a C++ compiler to CMake and accepts C++ flags, but uses `nvcc` as the underlying compiler.
+`nvcc` itself also uses an underlying host compiler, e.g. GCC.
+
+In Spack, the underlying host compiler is specified as below, e.g.:
+````bash
+> spack install package %gcc@8.0.0
+````
+This is still valid for Kokkos. To use the special wrapper for CUDA builds, request a desired compiler and simply add the `+wrapper` variant.
+````bash
+> spack install kokkos +cuda +wrapper %gcc@7.2.0
+````
+Downstream projects depending on Kokkos need to override their compiler.
+Kokkos provides the compiler in a `kokkos_cxx` variable,
+which points to either `nvcc_wrapper` when needed or the regular compiler otherwise.
+Spack projects already do this to use MPI compiler wrappers.
+````python
+def cmake_args(self):
+  options = []
+  ...
+  options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["kokkos"].kokkos_cxx)
+  ...
+  return options
+````
+Note: `nvcc_wrapper` works with the MPI compiler wrappers.
+If building your project with MPI, do NOT set your compiler to `nvcc_wrapper`.
+Instead set your compiler to `mpicxx` and `nvcc_wrapper` will be used under the hood.
+````python
+def cmake_args(self):
+  options = []
+  ...
+  options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["mpi"].mpicxx)
+  ...
+  return options
+````
+To accomplish this, `nvcc_wrapper` must depend on MPI (even though it uses no MPI).
+This has the unfortunate consequence that Kokkos CUDA projects not using MPI will implicitly depend on MPI anyway.
+This behavior is necessary for now, but will hopefully be removed later.
+When using environments, if MPI is not needed, you can remove the MPI dependency with:
+````bash
+> spack add kokkos-nvcc-wrapper ~mpi
+````
+
+## Developing With Spack
+
+Spack has historically been much more suited to *deployment* of mature packages than active testing or developing.
+However, recent features have improved support for development.
+Future releases are likely to make this even easier and incorporate Git integration.
+The most common commands will do a full build and install of the packages.
+If doing development, you may wish to merely set up a build environment.
+This allows you to modify the source and re-build.
+In this case, you can stop after configuring.
+Suppose you have a Kokkos checkout in the folder `kokkos-src`:
+````bash
+> spack dev-build -d kokkos-src -u cmake kokkos@develop +wrapper +openmp
+````
+This sets up a development environment for you in `kokkos-src` which you can use (Bash example shown):
+Note: Always specify `develop` as the version when doing `dev-build`, except in rare cases.
+You are usually developing a feature branch that will merge into `develop`,
+hence you are making a new `develop` branch.
+
+````bash
+> cd kokkos-src
+> source spack-build-env.txt
+> cd spack-build
+> make
+````
+Before sourcing the Spack development environment, you may wish to save your current environment:
+````bash
+> declare -px > myenv.sh
+````
+When done with Spack, you can then restore your original environment:
+````bash
+> source myenv.sh
+````
--- a/lib/kokkos/algorithms/CMakeLists.txt
+++ b/lib/kokkos/algorithms/CMakeLists.txt
@ -2,7 +2,9 @@

 KOKKOS_SUBPACKAGE(Algorithms)

-ADD_SUBDIRECTORY(src)
+IF (NOT Kokkos_INSTALL_TESTING)
+  ADD_SUBDIRECTORY(src)
+ENDIF()

 KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)

--- a/lib/kokkos/algorithms/src/CMakeLists.txt
+++ b/lib/kokkos/algorithms/src/CMakeLists.txt
@ -7,9 +7,15 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

 #-----------------------------------------------------------------------------

-FILE(GLOB HEADERS *.hpp)
-FILE(GLOB SOURCES *.cpp)
-LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
+FILE(GLOB ALGO_HEADERS *.hpp)
+FILE(GLOB ALGO_SOURCES *.cpp)
+LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
+
+INSTALL (
+  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
+  DESTINATION ${KOKKOS_HEADER_DIR}
+  FILES_MATCHING PATTERN "*.hpp"
+)

 #-----------------------------------------------------------------------------

@ -17,8 +23,8 @@ LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
 # These will get ignored for standalone CMake and a true interface library made
 KOKKOS_ADD_INTERFACE_LIBRARY(
  kokkosalgorithms
-  HEADERS ${HEADERS}
-  SOURCES ${SOURCES}
+  HEADERS ${ALGO_HEADERS}
+  SOURCES ${ALGO_SOURCES}
 )
 KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
  ${KOKKOS_TOP_BUILD_DIR}
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@ -94,9 +94,9 @@ namespace Kokkos {
    class Pool {
     public:
      //The Kokkos device type
-      typedef Device device_type;
+      using device_type = Device;
      //The actual generator type
-      typedef Generator<Device> generator_type;
+      using generator_type = Generator<Device>;

      //Default constructor: does not initialize a pool
      Pool();
@ -124,7 +124,7 @@ namespace Kokkos {
    class Generator {
     public:
     //The Kokkos device type
-    typedef DeviceType device_type;
+    using device_type = DeviceType;

    //Max return values of respective [X]rand[S]() functions
    enum {MAX_URAND = 0xffffffffU};
@ -138,75 +138,75 @@ namespace Kokkos {
    KOKKOS_INLINE_FUNCTION
    Generator (STATE_ARGUMENTS, int state_idx = 0);

-    //Draw a equidistributed uint32_t in the range (0,MAX_URAND]
+    //Draw a equidistributed uint32_t in the range [0,MAX_URAND)
    KOKKOS_INLINE_FUNCTION
    uint32_t urand();

-    //Draw a equidistributed uint64_t in the range (0,MAX_URAND64]
+    //Draw a equidistributed uint64_t in the range [0,MAX_URAND64)
    KOKKOS_INLINE_FUNCTION
    uint64_t urand64();

-    //Draw a equidistributed uint32_t in the range (0,range]
+    //Draw a equidistributed uint32_t in the range [0,range)
    KOKKOS_INLINE_FUNCTION
    uint32_t urand(const uint32_t& range);

-    //Draw a equidistributed uint32_t in the range (start,end]
+    //Draw a equidistributed uint32_t in the range [start,end)
    KOKKOS_INLINE_FUNCTION
    uint32_t urand(const uint32_t& start, const uint32_t& end );

-    //Draw a equidistributed uint64_t in the range (0,range]
+    //Draw a equidistributed uint64_t in the range [0,range)
    KOKKOS_INLINE_FUNCTION
    uint64_t urand64(const uint64_t& range);

-    //Draw a equidistributed uint64_t in the range (start,end]
+    //Draw a equidistributed uint64_t in the range [start,end)
    KOKKOS_INLINE_FUNCTION
    uint64_t urand64(const uint64_t& start, const uint64_t& end );

-    //Draw a equidistributed int in the range (0,MAX_RAND]
+    //Draw a equidistributed int in the range [0,MAX_RAND)
    KOKKOS_INLINE_FUNCTION
    int rand();

-    //Draw a equidistributed int in the range (0,range]
+    //Draw a equidistributed int in the range [0,range)
    KOKKOS_INLINE_FUNCTION
    int rand(const int& range);

-    //Draw a equidistributed int in the range (start,end]
+    //Draw a equidistributed int in the range [start,end)
    KOKKOS_INLINE_FUNCTION
    int rand(const int& start, const int& end );

-    //Draw a equidistributed int64_t in the range (0,MAX_RAND64]
+    //Draw a equidistributed int64_t in the range [0,MAX_RAND64)
    KOKKOS_INLINE_FUNCTION
    int64_t rand64();

-    //Draw a equidistributed int64_t in the range (0,range]
+    //Draw a equidistributed int64_t in the range [0,range)
    KOKKOS_INLINE_FUNCTION
    int64_t rand64(const int64_t& range);

-    //Draw a equidistributed int64_t in the range (start,end]
+    //Draw a equidistributed int64_t in the range [start,end)
    KOKKOS_INLINE_FUNCTION
    int64_t rand64(const int64_t& start, const int64_t& end );

-    //Draw a equidistributed float in the range (0,1.0]
+    //Draw a equidistributed float in the range [0,1.0)
    KOKKOS_INLINE_FUNCTION
    float frand();

-    //Draw a equidistributed float in the range (0,range]
+    //Draw a equidistributed float in the range [0,range)
    KOKKOS_INLINE_FUNCTION
    float frand(const float& range);

-    //Draw a equidistributed float in the range (start,end]
+    //Draw a equidistributed float in the range [start,end)
    KOKKOS_INLINE_FUNCTION
    float frand(const float& start, const float& end );

-    //Draw a equidistributed double in the range (0,1.0]
+    //Draw a equidistributed double in the range [0,1.0)
    KOKKOS_INLINE_FUNCTION
    double drand();

-    //Draw a equidistributed double in the range (0,range]
+    //Draw a equidistributed double in the range [0,range)
    KOKKOS_INLINE_FUNCTION
    double drand(const double& range);

-    //Draw a equidistributed double in the range (start,end]
+    //Draw a equidistributed double in the range [start,end)
    KOKKOS_INLINE_FUNCTION
    double drand(const double& start, const double& end );

@ -221,11 +221,11 @@ namespace Kokkos {

    //Additional Functions:

-    //Fills view with random numbers in the range (0,range]
+    //Fills view with random numbers in the range [0,range)
    template<class ViewType, class PoolType>
    void fill_random(ViewType view, PoolType pool, ViewType::value_type range);

-    //Fills view with random numbers in the range (start,end]
+    //Fills view with random numbers in the range [start,end)
    template<class ViewType, class PoolType>
    void fill_random(ViewType view, PoolType pool,
                     ViewType::value_type start, ViewType::value_type end);
@ -381,7 +381,7 @@ struct rand<Generator, unsigned long> {
 // NOTE (mfh 26 oct 2014) This is a partial specialization for long
 // long, a C99 / C++11 signed type which is guaranteed to be at
 // least 64 bits.  Do NOT write a partial specialization for
-// int64_t!!!  This is just a typedef!  It could be either long or
+// int64_t!!!  This is just an alias!  It could be either long or
 // long long.  We don't know which a priori, and I've seen both.
 // The types long and long long are guaranteed to differ, so it's
 // always safe to specialize for both.
@ -413,7 +413,7 @@ struct rand<Generator, long long> {
 // NOTE (mfh 26 oct 2014) This is a partial specialization for
 // unsigned long long, a C99 / C++11 unsigned type which is
 // guaranteed to be at least 64 bits.  Do NOT write a partial
-// specialization for uint64_t!!!  This is just a typedef!  It could
+// specialization for uint64_t!!!  This is just an alias!  It could
 // be either unsigned long or unsigned long long.  We don't know
 // which a priori, and I've seen both.  The types unsigned long and
 // unsigned long long are guaranteed to differ, so it's always safe
@ -604,11 +604,7 @@ struct Random_UniqueIndex {
  KOKKOS_FUNCTION
  static int get_state_idx(const locks_view_type) {
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    const int i = ExecutionSpace::hardware_thread_id();
-#else
    const int i = ExecutionSpace::impl_hardware_thread_id();
-#endif
    return i;
 #else
    return 0;
@ -652,15 +648,13 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
  static int get_state_idx(const locks_view_type& locks_) {
 #ifdef __HIP_DEVICE_COMPILE__
    const int i_offset =
-        (hipThreadIdx_x * hipBlockDim_y + hipThreadIdx_y) * hipBlockDim_z +
-        hipThreadIdx_z;
-    int i = (((hipBlockIdx_x * hipGridDim_y + hipBlockIdx_y) * hipGridDim_z +
-              hipBlockIdx_z) *
-                 hipBlockDim_x * hipBlockDim_y * hipBlockDim_z +
+        (threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
+    int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
+                 blockDim.x * blockDim.y * blockDim.z +
             i_offset) %
            locks_.extent(0);
    while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
-      i += hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
+      i += blockDim.x * blockDim.y * blockDim.z;
      if (i >= static_cast<int>(locks_.extent(0))) {
        i = i_offset;
      }
@ -687,7 +681,7 @@ class Random_XorShift64 {
  friend class Random_XorShift64_Pool<DeviceType>;

 public:
-  typedef DeviceType device_type;
+  using device_type = DeviceType;

  constexpr static uint32_t MAX_URAND   = std::numeric_limits<uint32_t>::max();
  constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
@ -805,11 +799,6 @@ class Random_XorShift64 {
  // number
  KOKKOS_INLINE_FUNCTION
  double normal() {
-#ifndef __HIP_DEVICE_COMPILE__  // FIXME_HIP
-    using std::sqrt;
-#else
-    using ::sqrt;
-#endif
    double S = 2.0;
    double U;
    while (S >= 1.0) {
@ -817,7 +806,7 @@ class Random_XorShift64 {
      const double V = 2.0 * drand() - 1.0;
      S              = U * U + V * V;
    }
-    return U * sqrt(-2.0 * log(S) / S);
+    return U * std::sqrt(-2.0 * log(S) / S);
  }

  KOKKOS_INLINE_FUNCTION
@ -830,15 +819,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift64_Pool {
 private:
  using execution_space = typename DeviceType::execution_space;
-  typedef View<int*, execution_space> locks_type;
-  typedef View<uint64_t*, DeviceType> state_data_type;
+  using locks_type      = View<int*, execution_space>;
+  using state_data_type = View<uint64_t*, DeviceType>;
  locks_type locks_;
  state_data_type state_;
  int num_states_;

 public:
-  typedef Random_XorShift64<DeviceType> generator_type;
-  typedef DeviceType device_type;
+  using generator_type = Random_XorShift64<DeviceType>;
+  using device_type    = DeviceType;

  KOKKOS_INLINE_FUNCTION
  Random_XorShift64_Pool() { num_states_ = 0; }
@ -923,8 +912,8 @@ class Random_XorShift1024 {
  friend class Random_XorShift1024_Pool<DeviceType>;

 public:
-  typedef Random_XorShift1024_Pool<DeviceType> pool_type;
-  typedef DeviceType device_type;
+  using pool_type   = Random_XorShift1024_Pool<DeviceType>;
+  using device_type = DeviceType;

  constexpr static uint32_t MAX_URAND   = std::numeric_limits<uint32_t>::max();
  constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
@ -1046,11 +1035,6 @@ class Random_XorShift1024 {
  // number
  KOKKOS_INLINE_FUNCTION
  double normal() {
-#ifndef KOKKOS_ENABLE_HIP  // FIXME_HIP
-    using std::sqrt;
-#else
-    using ::sqrt;
-#endif
    double S = 2.0;
    double U;
    while (S >= 1.0) {
@ -1058,7 +1042,7 @@ class Random_XorShift1024 {
      const double V = 2.0 * drand() - 1.0;
      S              = U * U + V * V;
    }
-    return U * sqrt(-2.0 * log(S) / S);
+    return U * std::sqrt(-2.0 * log(S) / S);
  }

  KOKKOS_INLINE_FUNCTION
@ -1071,9 +1055,9 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
 class Random_XorShift1024_Pool {
 private:
  using execution_space = typename DeviceType::execution_space;
-  typedef View<int*, execution_space> locks_type;
-  typedef View<int*, DeviceType> int_view_type;
-  typedef View<uint64_t * [16], DeviceType> state_data_type;
+  using locks_type      = View<int*, execution_space>;
+  using int_view_type   = View<int*, DeviceType>;
+  using state_data_type = View<uint64_t * [16], DeviceType>;

  locks_type locks_;
  state_data_type state_;
@ -1082,9 +1066,9 @@ class Random_XorShift1024_Pool {
  friend class Random_XorShift1024<DeviceType>;

 public:
-  typedef Random_XorShift1024<DeviceType> generator_type;
+  using generator_type = Random_XorShift1024<DeviceType>;

-  typedef DeviceType device_type;
+  using device_type = DeviceType;

  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024_Pool() { num_states_ = 0; }
@ -1176,14 +1160,13 @@ struct fill_random_functor_begin_end;

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 1, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1203,14 +1186,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 1, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 2, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1232,14 +1214,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 2, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 3, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1262,14 +1243,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 3, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 4, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1293,14 +1273,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 4, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 5, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1326,14 +1305,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 5, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 6, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1361,14 +1339,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 6, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 7, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1398,14 +1375,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 7, IndexType> {

 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_range<ViewType, RandomPool, loops, 8, IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type range;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
                            typename ViewType::const_value_type range_)
@ -1437,14 +1413,13 @@ struct fill_random_functor_range<ViewType, RandomPool, loops, 8, IndexType> {
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1466,14 +1441,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1497,14 +1471,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1529,14 +1502,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1562,14 +1534,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1597,14 +1568,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1634,14 +1604,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
@ -1673,14 +1642,13 @@ struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
 template <class ViewType, class RandomPool, int loops, class IndexType>
 struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
                                     IndexType> {
-  typedef typename ViewType::execution_space execution_space;
+  using execution_space = typename ViewType::execution_space;
  ViewType a;
  RandomPool rand_pool;
  typename ViewType::const_value_type begin, end;

-  typedef rand<typename RandomPool::generator_type,
-               typename ViewType::non_const_value_type>
-      Rand;
+  using Rand = rand<typename RandomPool::generator_type,
+                    typename ViewType::non_const_value_type>;

  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
                                typename ViewType::const_value_type begin_,
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@ -95,9 +95,9 @@ class BinSort {
 public:
  template <class DstViewType, class SrcViewType>
  struct copy_functor {
-    typedef typename SrcViewType::const_type src_view_type;
+    using src_view_type = typename SrcViewType::const_type;

-    typedef Impl::CopyOp<DstViewType, src_view_type> copy_op;
+    using copy_op = Impl::CopyOp<DstViewType, src_view_type>;

    DstViewType dst_values;
    src_view_type src_values;
@ -120,17 +120,17 @@ class BinSort {
    // If a Kokkos::View then can generate constant random access
    // otherwise can only use the constant type.

-    typedef typename std::conditional<
+    using src_view_type = typename std::conditional<
        Kokkos::is_view<SrcViewType>::value,
        Kokkos::View<typename SrcViewType::const_data_type,
                     typename SrcViewType::array_layout,
                     typename SrcViewType::device_type,
                     Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-        typename SrcViewType::const_type>::type src_view_type;
+        typename SrcViewType::const_type>::type;

-    typedef typename PermuteViewType::const_type perm_view_type;
+    using perm_view_type = typename PermuteViewType::const_type;

-    typedef Impl::CopyOp<DstViewType, src_view_type> copy_op;
+    using copy_op = Impl::CopyOp<DstViewType, src_view_type>;

    DstViewType dst_values;
    perm_view_type sort_order;
@ -151,8 +151,8 @@ class BinSort {
    }
  };

-  typedef typename Space::execution_space execution_space;
-  typedef BinSortOp bin_op_type;
+  using execution_space = typename Space::execution_space;
+  using bin_op_type     = BinSortOp;

  struct bin_count_tag {};
  struct bin_offset_tag {};
@ -160,30 +160,30 @@ class BinSort {
  struct bin_sort_bins_tag {};

 public:
-  typedef SizeType size_type;
-  typedef size_type value_type;
+  using size_type  = SizeType;
+  using value_type = size_type;

-  typedef Kokkos::View<size_type*, Space> offset_type;
-  typedef Kokkos::View<const int*, Space> bin_count_type;
+  using offset_type    = Kokkos::View<size_type*, Space>;
+  using bin_count_type = Kokkos::View<const int*, Space>;

-  typedef typename KeyViewType::const_type const_key_view_type;
+  using const_key_view_type = typename KeyViewType::const_type;

  // If a Kokkos::View then can generate constant random access
  // otherwise can only use the constant type.

-  typedef typename std::conditional<
+  using const_rnd_key_view_type = typename std::conditional<
      Kokkos::is_view<KeyViewType>::value,
      Kokkos::View<typename KeyViewType::const_data_type,
                   typename KeyViewType::array_layout,
                   typename KeyViewType::device_type,
                   Kokkos::MemoryTraits<Kokkos::RandomAccess> >,
-      const_key_view_type>::type const_rnd_key_view_type;
+      const_key_view_type>::type;

-  typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
-  typedef typename KeyViewType::const_value_type const_key_scalar;
+  using non_const_key_scalar = typename KeyViewType::non_const_value_type;
+  using const_key_scalar     = typename KeyViewType::const_value_type;

-  typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >
-      bin_count_atomic_type;
+  using bin_count_atomic_type =
+      Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> >;

 private:
  const_key_view_type keys;
@ -266,10 +266,10 @@ class BinSort {
  template <class ValuesViewType>
  void sort(ValuesViewType const& values, int values_range_begin,
            int values_range_end) const {
-    typedef Kokkos::View<typename ValuesViewType::data_type,
-                         typename ValuesViewType::array_layout,
-                         typename ValuesViewType::device_type>
-        scratch_view_type;
+    using scratch_view_type =
+        Kokkos::View<typename ValuesViewType::data_type,
+                     typename ValuesViewType::array_layout,
+                     typename ValuesViewType::device_type>;

    const size_t len        = range_end - range_begin;
    const size_t values_len = values_range_end - values_range_begin;
@ -278,13 +278,6 @@ class BinSort {
          "BinSort::sort: values range length != permutation vector length");
    }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    scratch_view_type sorted_values(
-        ViewAllocateWithoutInitializing(
-            "Kokkos::SortImpl::BinSortFunctor::sorted_values"),
-        len, values.extent(1), values.extent(2), values.extent(3),
-        values.extent(4), values.extent(5), values.extent(6), values.extent(7));
-#else
    scratch_view_type sorted_values(
        ViewAllocateWithoutInitializing(
            "Kokkos::SortImpl::BinSortFunctor::sorted_values"),
@ -303,7 +296,6 @@ class BinSort {
                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        values.rank_dynamic > 7 ? values.extent(7)
                                : KOKKOS_IMPL_CTOR_DEFAULT_ARG);
-#endif

    {
      copy_permute_functor<scratch_view_type /* DstViewType */
@ -511,8 +503,8 @@ bool try_std_sort(ViewType view) {

 template <class ViewType>
 struct min_max_functor {
-  typedef Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>
-      minmax_scalar;
+  using minmax_scalar =
+      Kokkos::MinMaxScalar<typename ViewType::non_const_value_type>;

  ViewType view;
  min_max_functor(const ViewType& view_) : view(view_) {}
@ -531,7 +523,7 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
  if (!always_use_kokkos_sort) {
    if (Impl::try_std_sort(view)) return;
  }
-  typedef BinOp1D<ViewType> CompType;
+  using CompType = BinOp1D<ViewType>;

  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
@ -548,8 +540,8 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {

 template <class ViewType>
 void sort(ViewType view, size_t const begin, size_t const end) {
-  typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy;
-  typedef BinOp1D<ViewType> CompType;
+  using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
+  using CompType     = BinOp1D<ViewType>;

  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
--- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
+++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
@ -20,14 +20,18 @@ KOKKOS_ADD_TEST_LIBRARY(
  HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h
  SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc
 )
-# WORKAROUND FOR HIPCC
-IF(Kokkos_ENABLE_HIP)
-  TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0 --amdgpu-target=gfx906")
-ELSE()
-  TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0")
+
+# avoid deprecation warnings from MSVC
+TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0)
+
+IF(NOT (Kokkos_ENABLE_CUDA AND WIN32))
+TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11)
 ENDIF()

-TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11)
+# Suppress clang-tidy diagnostics on code that we do not have control over
+IF(CMAKE_CXX_CLANG_TIDY)
+  SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "")
+ENDIF()

 SET(SOURCES
  UnitTestMain.cpp
--- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
@ -111,10 +111,10 @@ struct RandomProperties {

 template <class GeneratorPool, class Scalar>
 struct test_random_functor {
-  typedef typename GeneratorPool::generator_type rnd_type;
+  using rnd_type = typename GeneratorPool::generator_type;

-  typedef RandomProperties value_type;
-  typedef typename GeneratorPool::device_type device_type;
+  using value_type  = RandomProperties;
+  using device_type = typename GeneratorPool::device_type;

  GeneratorPool rand_pool;
  const double mean;
@ -125,12 +125,12 @@ struct test_random_functor {
  // implementations might violate this upper bound, due to rounding
  // error.  Just in case, we leave an extra space at the end of each
  // dimension, in the View types below.
-  typedef Kokkos::View<int[HIST_DIM1D + 1], typename GeneratorPool::device_type>
-      type_1d;
+  using type_1d =
+      Kokkos::View<int[HIST_DIM1D + 1], typename GeneratorPool::device_type>;
  type_1d density_1d;
-  typedef Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
-                       typename GeneratorPool::device_type>
-      type_3d;
+  using type_3d =
+      Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
+                   typename GeneratorPool::device_type>;
  type_3d density_3d;

  test_random_functor(GeneratorPool rand_pool_, type_1d d1d, type_3d d3d)
@ -200,9 +200,9 @@ struct test_random_functor {

 template <class DeviceType>
 struct test_histogram1d_functor {
-  typedef RandomProperties value_type;
-  typedef typename DeviceType::execution_space execution_space;
-  typedef typename DeviceType::memory_space memory_space;
+  using value_type      = RandomProperties;
+  using execution_space = typename DeviceType::execution_space;
+  using memory_space    = typename DeviceType::memory_space;

  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
  // an exclusive upper bound on the range of random numbers that
@ -210,7 +210,7 @@ struct test_histogram1d_functor {
  // implementations might violate this upper bound, due to rounding
  // error.  Just in case, we leave an extra space at the end of each
  // dimension, in the View type below.
-  typedef Kokkos::View<int[HIST_DIM1D + 1], memory_space> type_1d;
+  using type_1d = Kokkos::View<int[HIST_DIM1D + 1], memory_space>;
  type_1d density_1d;
  double mean;

@ -219,7 +219,7 @@ struct test_histogram1d_functor {

  KOKKOS_INLINE_FUNCTION void operator()(
      const typename memory_space::size_type i, RandomProperties& prop) const {
-    typedef typename memory_space::size_type size_type;
+    using size_type    = typename memory_space::size_type;
    const double count = density_1d(i);
    prop.mean += count;
    prop.variance += 1.0 * (count - mean) * (count - mean);
@ -234,9 +234,9 @@ struct test_histogram1d_functor {

 template <class DeviceType>
 struct test_histogram3d_functor {
-  typedef RandomProperties value_type;
-  typedef typename DeviceType::execution_space execution_space;
-  typedef typename DeviceType::memory_space memory_space;
+  using value_type      = RandomProperties;
+  using execution_space = typename DeviceType::execution_space;
+  using memory_space    = typename DeviceType::memory_space;

  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
  // an exclusive upper bound on the range of random numbers that
@ -244,9 +244,9 @@ struct test_histogram3d_functor {
  // implementations might violate this upper bound, due to rounding
  // error.  Just in case, we leave an extra space at the end of each
  // dimension, in the View type below.
-  typedef Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
-                       memory_space>
-      type_3d;
+  using type_3d =
+      Kokkos::View<int[HIST_DIM3D + 1][HIST_DIM3D + 1][HIST_DIM3D + 1],
+                   memory_space>;
  type_3d density_3d;
  double mean;

@ -255,7 +255,7 @@ struct test_histogram3d_functor {

  KOKKOS_INLINE_FUNCTION void operator()(
      const typename memory_space::size_type i, RandomProperties& prop) const {
-    typedef typename memory_space::size_type size_type;
+    using size_type    = typename memory_space::size_type;
    const double count = density_3d(
        i / (HIST_DIM3D * HIST_DIM3D),
        (i % (HIST_DIM3D * HIST_DIM3D)) / HIST_DIM3D, i % HIST_DIM3D);
@ -276,7 +276,7 @@ struct test_histogram3d_functor {
 //
 template <class RandomGenerator, class Scalar>
 struct test_random_scalar {
-  typedef typename RandomGenerator::generator_type rnd_type;
+  using rnd_type = typename RandomGenerator::generator_type;

  int pass_mean, pass_var, pass_covar;
  int pass_hist1d_mean, pass_hist1d_var, pass_hist1d_covar;
@ -294,7 +294,7 @@ struct test_random_scalar {
      cout << " -- Testing randomness properties" << endl;

      RandomProperties result;
-      typedef test_random_functor<RandomGenerator, Scalar> functor_type;
+      using functor_type = test_random_functor<RandomGenerator, Scalar>;
      parallel_reduce(num_draws / 1024,
                      functor_type(pool, density_1d, density_3d), result);

@ -325,8 +325,8 @@ struct test_random_scalar {
      cout << " -- Testing 1-D histogram" << endl;

      RandomProperties result;
-      typedef test_histogram1d_functor<typename RandomGenerator::device_type>
-          functor_type;
+      using functor_type =
+          test_histogram1d_functor<typename RandomGenerator::device_type>;
      parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);

      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
@ -357,8 +357,8 @@ struct test_random_scalar {
      cout << " -- Testing 3-D histogram" << endl;

      RandomProperties result;
-      typedef test_histogram3d_functor<typename RandomGenerator::device_type>
-          functor_type;
+      using functor_type =
+          test_histogram3d_functor<typename RandomGenerator::device_type>;
      parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);

      double tolerance   = 6 * std::sqrt(1.0 / HIST_DIM1D);
--- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
@ -55,8 +55,8 @@ namespace Impl {

 template <class ExecutionSpace, class Scalar>
 struct is_sorted_struct {
-  typedef unsigned int value_type;
-  typedef ExecutionSpace execution_space;
+  using value_type      = unsigned int;
+  using execution_space = ExecutionSpace;

  Kokkos::View<Scalar*, ExecutionSpace> keys;

@ -69,8 +69,8 @@ struct is_sorted_struct {

 template <class ExecutionSpace, class Scalar>
 struct sum {
-  typedef double value_type;
-  typedef ExecutionSpace execution_space;
+  using value_type      = double;
+  using execution_space = ExecutionSpace;

  Kokkos::View<Scalar*, ExecutionSpace> keys;

@ -81,8 +81,8 @@ struct sum {

 template <class ExecutionSpace, class Scalar>
 struct bin3d_is_sorted_struct {
-  typedef unsigned int value_type;
-  typedef ExecutionSpace execution_space;
+  using value_type      = unsigned int;
+  using execution_space = ExecutionSpace;

  Kokkos::View<Scalar * [3], ExecutionSpace> keys;

@ -115,8 +115,8 @@ struct bin3d_is_sorted_struct {

 template <class ExecutionSpace, class Scalar>
 struct sum3D {
-  typedef double value_type;
-  typedef ExecutionSpace execution_space;
+  using value_type      = double;
+  using execution_space = ExecutionSpace;

  Kokkos::View<Scalar * [3], ExecutionSpace> keys;

@ -131,7 +131,7 @@ struct sum3D {

 template <class ExecutionSpace, typename KeyType>
 void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
-  typedef Kokkos::View<KeyType*, ExecutionSpace> KeyViewType;
+  using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>;
  KeyViewType keys("Keys", n);

  // Test sorting array with all numbers equal
@ -166,7 +166,7 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {

 template <class ExecutionSpace, typename KeyType>
 void test_3D_sort_impl(unsigned int n) {
-  typedef Kokkos::View<KeyType * [3], ExecutionSpace> KeyViewType;
+  using KeyViewType = Kokkos::View<KeyType * [3], ExecutionSpace>;

  KeyViewType keys("Keys", n * n * n);

@ -186,7 +186,7 @@ void test_3D_sort_impl(unsigned int n) {
  typename KeyViewType::value_type min[3] = {0, 0, 0};
  typename KeyViewType::value_type max[3] = {100, 100, 100};

-  typedef Kokkos::BinOp3D<KeyViewType> BinOp;
+  using BinOp = Kokkos::BinOp3D<KeyViewType>;
  BinOp bin_op(bin_max, min, max);
  Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
  Sorter.create_permute_vector();
@ -215,9 +215,9 @@ void test_3D_sort_impl(unsigned int n) {

 template <class ExecutionSpace, typename KeyType>
 void test_dynamic_view_sort_impl(unsigned int n) {
-  typedef Kokkos::Experimental::DynamicView<KeyType*, ExecutionSpace>
-      KeyDynamicViewType;
-  typedef Kokkos::View<KeyType*, ExecutionSpace> KeyViewType;
+  using KeyDynamicViewType =
+      Kokkos::Experimental::DynamicView<KeyType*, ExecutionSpace>;
+  using KeyViewType = Kokkos::View<KeyType*, ExecutionSpace>;

  const size_t upper_bound    = 2 * n;
  const size_t min_chunk_size = 1024;
@ -305,8 +305,8 @@ void test_issue_1160_impl() {
  Kokkos::deep_copy(x_, h_x);
  Kokkos::deep_copy(v_, h_v);

-  typedef decltype(element_) KeyViewType;
-  typedef Kokkos::BinOp1D<KeyViewType> BinOp;
+  using KeyViewType = decltype(element_);
+  using BinOp       = Kokkos::BinOp1D<KeyViewType>;

  int begin = 3;
  int end   = 8;
--- a/lib/kokkos/appveyor.yml
+++ b/lib/kokkos/appveyor.yml
@ -5,6 +5,6 @@ build_script:
 - cmd: >-
    mkdir build &&
    cd build &&
-    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_LIBDL=OFF -DKokkos_ENABLE_PROFILING=OFF &&
+    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON &&
    cmake --build . --target install &&
    ctest -C Debug -V
--- a/lib/kokkos/benchmarks/atomic/main.cpp
+++ b/lib/kokkos/benchmarks/atomic/main.cpp
@ -69,13 +69,13 @@ int main(int argc, char* argv[]) {
      return 0;
    }

-    int L    = atoi(argv[1]);
-    int N    = atoi(argv[2]);
-    int M    = atoi(argv[3]);
-    int D    = atoi(argv[4]);
-    int K    = atoi(argv[5]);
-    int R    = atoi(argv[6]);
-    int type = atoi(argv[7]);
+    int L    = std::stoi(argv[1]);
+    int N    = std::stoi(argv[2]);
+    int M    = std::stoi(argv[3]);
+    int D    = std::stoi(argv[4]);
+    int K    = std::stoi(argv[5]);
+    int R    = std::stoi(argv[6]);
+    int type = std::stoi(argv[7]);

    Kokkos::View<int*> offsets("Offsets", L, M);
    Kokkos::Random_XorShift64_Pool<> pool(12371);
--- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
+++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp
@ -73,15 +73,15 @@ int main(int argc, char* argv[]) {
    return 0;
  }

-  int P = atoi(argv[1]);
-  int N = atoi(argv[2]);
-  int K = atoi(argv[3]);
-  int R = atoi(argv[4]);
-  int D = atoi(argv[5]);
-  int U = atoi(argv[6]);
-  int F = atoi(argv[7]);
-  int T = atoi(argv[8]);
-  int S = atoi(argv[9]);
+  int P = std::stoi(argv[1]);
+  int N = std::stoi(argv[2]);
+  int K = std::stoi(argv[3]);
+  int R = std::stoi(argv[4]);
+  int D = std::stoi(argv[5]);
+  int U = std::stoi(argv[6]);
+  int F = std::stoi(argv[7]);
+  int T = std::stoi(argv[8]);
+  int S = std::stoi(argv[9]);

  if (U > 8) {
    printf("U must be 1-8\n");
--- a/lib/kokkos/benchmarks/gather/main.cpp
+++ b/lib/kokkos/benchmarks/gather/main.cpp
@ -72,13 +72,13 @@ int main(int argc, char* argv[]) {
    return 0;
  }

-  int S = atoi(argv[1]);
-  int N = atoi(argv[2]);
-  int K = atoi(argv[3]);
-  int D = atoi(argv[4]);
-  int R = atoi(argv[5]);
-  int U = atoi(argv[6]);
-  int F = atoi(argv[7]);
+  int S = std::stoi(argv[1]);
+  int N = std::stoi(argv[2]);
+  int K = std::stoi(argv[3]);
+  int D = std::stoi(argv[4]);
+  int R = std::stoi(argv[5]);
+  int U = std::stoi(argv[6]);
+  int F = std::stoi(argv[7]);

  if ((S != 1) && (S != 2) && (S != 4)) {
    printf("S must be one of 1,2,4\n");
--- a/lib/kokkos/benchmarks/gups/gups-kokkos.cc
+++ b/lib/kokkos/benchmarks/gups/gups-kokkos.cc
@ -50,151 +50,152 @@
 #define HLINE "-------------------------------------------------------------\n"

 #if defined(KOKKOS_ENABLE_CUDA)
-typedef Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror GUPSHostArray;
-typedef Kokkos::View<int64_t*, Kokkos::CudaSpace> GUPSDeviceArray;
+using GUPSHostArray   = Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror;
+using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::CudaSpace>;
 #else
-typedef Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror GUPSHostArray;
-typedef Kokkos::View<int64_t*, Kokkos::HostSpace> GUPSDeviceArray;
+using GUPSHostArray   = Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror;
+using GUPSDeviceArray = Kokkos::View<int64_t*, Kokkos::HostSpace>;
 #endif

-typedef int GUPSIndex;
+using GUPSIndex = int;

 double now() {
-	struct timeval now;
-	gettimeofday(&now, nullptr);
+  struct timeval now;
+  gettimeofday(&now, nullptr);

-	return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
+  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
 }

-void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, const int64_t dataCount) {
-	for( GUPSIndex i = 0; i < indices.extent(0); ++i ) {
-		indices[i] = lrand48() % dataCount;
-	}
+void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices,
+                       const int64_t dataCount) {
+  for (GUPSIndex i = 0; i < indices.extent(0); ++i) {
+    indices[i] = lrand48() % dataCount;
+  }

-	Kokkos::deep_copy(dev_indices, indices);
+  Kokkos::deep_copy(dev_indices, indices);
 }

-void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, const int64_t datum,
-	const bool performAtomics) {
+void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data,
+              const int64_t datum, const bool performAtomics) {
+  if (performAtomics) {
+    Kokkos::parallel_for(
+        "bench-gups-atomic", indices.extent(0),
+        KOKKOS_LAMBDA(const GUPSIndex i) {
+          Kokkos::atomic_fetch_xor(&data[indices[i]], datum);
+        });
+  } else {
+    Kokkos::parallel_for(
+        "bench-gups-non-atomic", indices.extent(0),
+        KOKKOS_LAMBDA(const GUPSIndex i) { data[indices[i]] ^= datum; });
+  }

-	if( performAtomics ) {
-		Kokkos::parallel_for("bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
-			Kokkos::atomic_fetch_xor( &data[indices[i]], datum );
-		});
-	} else {
-		Kokkos::parallel_for("bench-gups-non-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
-			data[indices[i]] ^= datum;
-		});
-	}
-
-	Kokkos::fence();
+  Kokkos::fence();
 }

-int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, const int repeats,
-	const bool useAtomics) {
+int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount,
+                  const int repeats, const bool useAtomics) {
+  printf("Reports fastest timing per kernel\n");
+  printf("Creating Views...\n");

-	printf("Reports fastest timing per kernel\n");
-	printf("Creating Views...\n");
+  printf("Memory Sizes:\n");
+  printf("- Elements:      %15" PRIu64 " (%12.4f MB)\n",
+         static_cast<uint64_t>(dataCount),
+         1.0e-6 * ((double)dataCount * (double)sizeof(int64_t)));
+  printf("- Indices:       %15" PRIu64 " (%12.4f MB)\n",
+         static_cast<uint64_t>(indicesCount),
+         1.0e-6 * ((double)indicesCount * (double)sizeof(int64_t)));
+  printf(" - Atomics:      %15s\n", (useAtomics ? "Yes" : "No"));
+  printf("Benchmark kernels will be performed for %d iterations.\n", repeats);

-	printf("Memory Sizes:\n");
-	printf("- Elements:      %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(dataCount),
-		1.0e-6 * ((double) dataCount * (double) sizeof(int64_t)));
-	printf("- Indices:       %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(indicesCount),
-		1.0e-6 * ((double) indicesCount * (double) sizeof(int64_t)));
-	printf(" - Atomics:      %15s\n", (useAtomics ? "Yes" : "No") );
-	printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
+  printf(HLINE);

-	printf(HLINE);
+  GUPSDeviceArray dev_indices("indices", indicesCount);
+  GUPSDeviceArray dev_data("data", dataCount);
+  int64_t datum = -1;

-	GUPSDeviceArray dev_indices("indices", indicesCount);
-	GUPSDeviceArray dev_data("data", dataCount);
-	int64_t datum = -1;
+  GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
+  GUPSHostArray data    = Kokkos::create_mirror_view(dev_data);

-	GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
-	GUPSHostArray data    = Kokkos::create_mirror_view(dev_data);
+  double gupsTime = 0.0;

-	double gupsTime  = 0.0;
-
-	printf("Initializing Views...\n");
+  printf("Initializing Views...\n");

 #if defined(KOKKOS_HAVE_OPENMP)
-	Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
+  Kokkos::parallel_for(
+      "init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
 #else
-	Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
+  Kokkos::parallel_for(
+      "init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
 #endif
-		KOKKOS_LAMBDA(const int i) {
-
-		data[i] = 10101010101;
-	});
+      KOKKOS_LAMBDA(const int i) { data[i] = 10101010101; });

 #if defined(KOKKOS_HAVE_OPENMP)
-	Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
+  Kokkos::parallel_for(
+      "init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
 #else
-	Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
+  Kokkos::parallel_for(
+      "init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
 #endif
-		KOKKOS_LAMBDA(const int i) {
+      KOKKOS_LAMBDA(const int i) { indices[i] = 0; });

-		indices[i] = 0;
-	});
+  Kokkos::deep_copy(dev_data, data);
+  Kokkos::deep_copy(dev_indices, indices);
+  double start;

-	Kokkos::deep_copy(dev_data, data);
-	Kokkos::deep_copy(dev_indices, indices);
-	double start;
+  printf("Starting benchmarking...\n");

-	printf("Starting benchmarking...\n");
+  for (GUPSIndex k = 0; k < repeats; ++k) {
+    randomize_indices(indices, dev_indices, data.extent(0));

-	for( GUPSIndex k = 0; k < repeats; ++k ) {
-		randomize_indices(indices, dev_indices, data.extent(0));
+    start = now();
+    run_gups(dev_indices, dev_data, datum, useAtomics);
+    gupsTime += now() - start;
+  }

-		start = now();
-		run_gups(dev_indices, dev_data, datum, useAtomics);
-		gupsTime += now() - start;
-	}
+  Kokkos::deep_copy(indices, dev_indices);
+  Kokkos::deep_copy(data, dev_data);

-	Kokkos::deep_copy(indices, dev_indices);
-	Kokkos::deep_copy(data, dev_data);
+  printf(HLINE);
+  printf(
+      "GUP/s Random:      %18.6f\n",
+      (1.0e-9 * ((double)repeats) * (double)dev_indices.extent(0)) / gupsTime);
+  printf(HLINE);

-	printf(HLINE);
-	printf("GUP/s Random:      %18.6f\n",
-		(1.0e-9 * ((double) repeats) * (double) dev_indices.extent(0)) / gupsTime);
-	printf(HLINE);
-
-	return 0;
+  return 0;
 }

 int main(int argc, char* argv[]) {
+  printf(HLINE);
+  printf("Kokkos GUPS Benchmark\n");
+  printf(HLINE);

-	printf(HLINE);
-	printf("Kokkos GUPS Benchmark\n");
-	printf(HLINE);
+  srand48(1010101);

-	srand48(1010101);
+  Kokkos::initialize(argc, argv);

-	Kokkos::initialize(argc, argv);
+  int64_t indices = 8192;
+  int64_t data    = 33554432;
+  int64_t repeats = 10;
+  bool useAtomics = false;

-	int64_t indices = 8192;
-	int64_t data    = 33554432;
-	int64_t repeats = 10;
-	bool useAtomics = false;
+  for (int i = 1; i < argc; ++i) {
+    if (strcmp(argv[i], "--indices") == 0) {
+      indices = std::atoll(argv[i + 1]);
+      ++i;
+    } else if (strcmp(argv[i], "--data") == 0) {
+      data = std::atoll(argv[i + 1]);
+      ++i;
+    } else if (strcmp(argv[i], "--repeats") == 0) {
+      repeats = std::atoll(argv[i + 1]);
+      ++i;
+    } else if (strcmp(argv[i], "--atomics") == 0) {
+      useAtomics = true;
+    }
+  }

-	for( int i = 1; i < argc; ++i ) {
-		if( strcmp( argv[i], "--indices" ) == 0 ) {
-			indices = std::atoll(argv[i+1]);
-			++i;
-		} else if( strcmp( argv[i], "--data" ) == 0 ) {
-			data = std::atoll(argv[i+1]);
-			++i;
-		} else if( strcmp( argv[i], "--repeats" ) == 0 ) {
-			repeats = std::atoll(argv[i+1]);
-			++i;
-		} else if( strcmp( argv[i], "--atomics" ) == 0 ) {
-			useAtomics = true;
-		}
-	}
+  const int rc = run_benchmark(indices, data, repeats, useAtomics);

-	const int rc = run_benchmark(indices, data, repeats, useAtomics);
+  Kokkos::finalize();

-	Kokkos::finalize();
-
-	return rc;
+  return rc;
 }
--- a/lib/kokkos/benchmarks/policy_performance/main.cpp
+++ b/lib/kokkos/benchmarks/policy_performance/main.cpp
@ -94,22 +94,22 @@ int main(int argc, char* argv[]) {
    return 0;
  }

-  int team_range   = atoi(argv[1]);
-  int thread_range = atoi(argv[2]);
-  int vector_range = atoi(argv[3]);
+  int team_range   = std::stoi(argv[1]);
+  int thread_range = std::stoi(argv[2]);
+  int vector_range = std::stoi(argv[3]);

-  int outer_repeat  = atoi(argv[4]);
-  int thread_repeat = atoi(argv[5]);
-  int vector_repeat = atoi(argv[6]);
+  int outer_repeat  = std::stoi(argv[4]);
+  int thread_repeat = std::stoi(argv[5]);
+  int vector_repeat = std::stoi(argv[6]);

-  int team_size   = atoi(argv[7]);
-  int vector_size = atoi(argv[8]);
-  int schedule    = atoi(argv[9]);
-  int test_type   = atoi(argv[10]);
+  int team_size   = std::stoi(argv[7]);
+  int vector_size = std::stoi(argv[8]);
+  int schedule    = std::stoi(argv[9]);
+  int test_type   = std::stoi(argv[10]);

  int disable_verbose_output = 0;
  if (argc > 11) {
-    disable_verbose_output = atoi(argv[11]);
+    disable_verbose_output = std::stoi(argv[11]);
  }

  if (schedule != 1 && schedule != 2) {
@ -138,9 +138,9 @@ int main(int argc, char* argv[]) {
                    double& lval) { lval += 1; },
      result);

-  typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
-  typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
-  typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
+  using view_type_1d = Kokkos::View<double*, Kokkos::LayoutRight>;
+  using view_type_2d = Kokkos::View<double**, Kokkos::LayoutRight>;
+  using view_type_3d = Kokkos::View<double***, Kokkos::LayoutRight>;

  // Allocate view without initializing
  // Call a 'warmup' test with 1 repeat - this will initialize the corresponding
--- a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
+++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp
@ -68,8 +68,8 @@ void test_policy(int team_range, int thread_range, int vector_range,
                 int team_size, int vector_size, int test_type, ViewType1& v1,
                 ViewType2& v2, ViewType3& v3, double& result,
                 double& result_expect, double& time) {
-  typedef Kokkos::TeamPolicy<ScheduleType, IndexType> t_policy;
-  typedef typename t_policy::member_type t_team;
+  using t_policy = Kokkos::TeamPolicy<ScheduleType, IndexType>;
+  using t_team   = typename t_policy::member_type;
  Kokkos::Timer timer;

  for (int orep = 0; orep < outer_repeat; orep++) {
--- a/lib/kokkos/benchmarks/stream/stream-kokkos.cc
+++ b/lib/kokkos/benchmarks/stream/stream-kokkos.cc
@ -48,219 +48,224 @@
 #include <sys/time.h>

 #define STREAM_ARRAY_SIZE 100000000
-#define STREAM_NTIMES     20
+#define STREAM_NTIMES 20

 #define HLINE "-------------------------------------------------------------\n"

 #if defined(KOKKOS_ENABLE_CUDA)
-typedef Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror StreamHostArray;
-typedef Kokkos::View<double*, Kokkos::CudaSpace> StreamDeviceArray;
+using StreamHostArray   = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
+using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
 #else
-typedef Kokkos::View<double*, Kokkos::HostSpace>::HostMirror StreamHostArray;
-typedef Kokkos::View<double*, Kokkos::HostSpace> StreamDeviceArray;
+using StreamHostArray   = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
+using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
 #endif

-typedef int StreamIndex;
+using StreamIndex = int;

 double now() {
-	struct timeval now;
-	gettimeofday(&now, nullptr);
+  struct timeval now;
+  gettimeofday(&now, nullptr);

-	return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
+  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
 }

-void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
+void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
+                  StreamDeviceArray& c) {
+  Kokkos::parallel_for(
+      "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });

-	Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
-		c[i] = a[i];
-	});
-
-	Kokkos::fence();
+  Kokkos::fence();
 }

-void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
-       	const double scalar) {
+void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
+                   StreamDeviceArray& c, const double scalar) {
+  Kokkos::parallel_for(
+      "copy", a.extent(0),
+      KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });

-	Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
-		b[i] = scalar * c[i];
-	});
-
-	Kokkos::fence();
+  Kokkos::fence();
 }

-void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
-	Kokkos::parallel_for("add", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
-                c[i] = a[i] + b[i];
-        });
+void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
+                 StreamDeviceArray& c) {
+  Kokkos::parallel_for(
+      "add", a.extent(0),
+      KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });

-	Kokkos::fence();
+  Kokkos::fence();
 }

-void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
-	const double scalar) {
+void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
+                   StreamDeviceArray& c, const double scalar) {
+  Kokkos::parallel_for(
+      "triad", a.extent(0),
+      KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });

-	Kokkos::parallel_for("triad", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
-		a[i] = b[i] + scalar * c[i];
-	});
-
-	Kokkos::fence();
+  Kokkos::fence();
 }

-int perform_validation(StreamHostArray& a, StreamHostArray& b, StreamHostArray& c,
-	const StreamIndex arraySize, const double scalar) {
+int perform_validation(StreamHostArray& a, StreamHostArray& b,
+                       StreamHostArray& c, const StreamIndex arraySize,
+                       const double scalar) {
+  double ai = 1.0;
+  double bi = 2.0;
+  double ci = 0.0;

-	double ai = 1.0;
-	double bi = 2.0;
-	double ci = 0.0;
+  for (StreamIndex i = 0; i < arraySize; ++i) {
+    ci = ai;
+    bi = scalar * ci;
+    ci = ai + bi;
+    ai = bi + scalar * ci;
+  };

-	for( StreamIndex i = 0; i < arraySize; ++i ) {
-		ci = ai;
-		bi = scalar * ci;
-		ci = ai + bi;
-		ai = bi + scalar * ci;
-	};
+  double aError = 0.0;
+  double bError = 0.0;
+  double cError = 0.0;

-	double aError = 0.0;
-	double bError = 0.0;
-	double cError = 0.0;
+  for (StreamIndex i = 0; i < arraySize; ++i) {
+    aError = std::abs(a[i] - ai);
+    bError = std::abs(b[i] - bi);
+    cError = std::abs(c[i] - ci);
+  }

-	for( StreamIndex i = 0; i < arraySize; ++i ) {
-		aError = std::abs( a[i] - ai );
-		bError = std::abs( b[i] - bi );
-		cError = std::abs( c[i] - ci );
-	}
+  double aAvgError = aError / (double)arraySize;
+  double bAvgError = bError / (double)arraySize;
+  double cAvgError = cError / (double)arraySize;

-	double aAvgError = aError / (double) arraySize;
-	double bAvgError = bError / (double) arraySize;
-	double cAvgError = cError / (double) arraySize;
+  const double epsilon = 1.0e-13;
+  int errorCount       = 0;

-	const double epsilon = 1.0e-13;
-	int errorCount = 0;
+  if (std::abs(aAvgError / ai) > epsilon) {
+    fprintf(stderr, "Error: validation check on View a failed.\n");
+    errorCount++;
+  }

-	if( std::abs( aAvgError / ai ) > epsilon ) {
-		fprintf(stderr, "Error: validation check on View a failed.\n");
-		errorCount++;
-	}
+  if (std::abs(bAvgError / bi) > epsilon) {
+    fprintf(stderr, "Error: validation check on View b failed.\n");
+    errorCount++;
+  }

-	if( std::abs( bAvgError / bi ) > epsilon ) {
-		fprintf(stderr, "Error: validation check on View b failed.\n");
-		errorCount++;
-	}
+  if (std::abs(cAvgError / ci) > epsilon) {
+    fprintf(stderr, "Error: validation check on View c failed.\n");
+    errorCount++;
+  }

-	if( std::abs( cAvgError / ci ) > epsilon ) {
-		fprintf(stderr, "Error: validation check on View c failed.\n");
-		errorCount++;
-	}
+  if (errorCount == 0) {
+    printf("All solutions checked and verified.\n");
+  }

-	if( errorCount == 0 ) {
-		printf("All solutions checked and verified.\n");
-	}
-
-	return errorCount;
+  return errorCount;
 }

 int run_benchmark() {
+  printf("Reports fastest timing per kernel\n");
+  printf("Creating Views...\n");

-	printf("Reports fastest timing per kernel\n");
-	printf("Creating Views...\n");
+  printf("Memory Sizes:\n");
+  printf("- Array Size:    %" PRIu64 "\n",
+         static_cast<uint64_t>(STREAM_ARRAY_SIZE));
+  printf("- Per Array:     %12.2f MB\n",
+         1.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double));
+  printf("- Total:         %12.2f MB\n",
+         3.0e-6 * (double)STREAM_ARRAY_SIZE * (double)sizeof(double));

-	printf("Memory Sizes:\n");
-	printf("- Array Size:    %" PRIu64 "\n", static_cast<uint64_t>(STREAM_ARRAY_SIZE));
-	printf("- Per Array:     %12.2f MB\n", 1.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
-	printf("- Total:         %12.2f MB\n", 3.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
+  printf("Benchmark kernels will be performed for %d iterations.\n",
+         STREAM_NTIMES);

-	printf("Benchmark kernels will be performed for %d iterations.\n", STREAM_NTIMES);
+  printf(HLINE);

-	printf(HLINE);
+  StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
+  StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
+  StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);

-	StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
-	StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
-	StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
+  StreamHostArray a = Kokkos::create_mirror_view(dev_a);
+  StreamHostArray b = Kokkos::create_mirror_view(dev_b);
+  StreamHostArray c = Kokkos::create_mirror_view(dev_c);

-	StreamHostArray a = Kokkos::create_mirror_view(dev_a);
-	StreamHostArray b = Kokkos::create_mirror_view(dev_b);
-	StreamHostArray c = Kokkos::create_mirror_view(dev_c);
+  const double scalar = 3.0;

-	const double scalar = 3.0;
+  double copyTime  = std::numeric_limits<double>::max();
+  double scaleTime = std::numeric_limits<double>::max();
+  double addTime   = std::numeric_limits<double>::max();
+  double triadTime = std::numeric_limits<double>::max();

-	double copyTime  = std::numeric_limits<double>::max();
-	double scaleTime = std::numeric_limits<double>::max();
-	double addTime   = std::numeric_limits<double>::max();
-	double triadTime = std::numeric_limits<double>::max();
-
-	printf("Initializing Views...\n");
+  printf("Initializing Views...\n");

 #if defined(KOKKOS_HAVE_OPENMP)
-	Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
+  Kokkos::parallel_for(
+      "init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
 #else
-	Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
+  Kokkos::parallel_for(
+      "init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
 #endif
-		KOKKOS_LAMBDA(const int i) {
+      KOKKOS_LAMBDA(const int i) {
+        a[i] = 1.0;
+        b[i] = 2.0;
+        c[i] = 0.0;
+      });

-		a[i] = 1.0;
-		b[i] = 2.0;
-		c[i] = 0.0;
-	});
+  // Copy contents of a (from the host) to the dev_a (device)
+  Kokkos::deep_copy(dev_a, a);
+  Kokkos::deep_copy(dev_b, b);
+  Kokkos::deep_copy(dev_c, c);

-	// Copy contents of a (from the host) to the dev_a (device)
-	Kokkos::deep_copy(dev_a, a);
-	Kokkos::deep_copy(dev_b, b);
-	Kokkos::deep_copy(dev_c, c);
+  double start;

-	double start;
+  printf("Starting benchmarking...\n");

-	printf("Starting benchmarking...\n");
+  for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
+    start = now();
+    perform_copy(dev_a, dev_b, dev_c);
+    copyTime = std::min(copyTime, (now() - start));

-	for( StreamIndex k = 0; k < STREAM_NTIMES; ++k ) {
-		start = now();
-		perform_copy(dev_a, dev_b, dev_c);
-		copyTime = std::min( copyTime, (now() - start) );
+    start = now();
+    perform_scale(dev_a, dev_b, dev_c, scalar);
+    scaleTime = std::min(scaleTime, (now() - start));

-		start = now();
-		perform_scale(dev_a, dev_b, dev_c, scalar);
-		scaleTime = std::min( scaleTime, (now() - start) );
+    start = now();
+    perform_add(dev_a, dev_b, dev_c);
+    addTime = std::min(addTime, (now() - start));

-		start = now();
-		perform_add(dev_a, dev_b, dev_c);
-		addTime = std::min( addTime, (now() - start) );
+    start = now();
+    perform_triad(dev_a, dev_b, dev_c, scalar);
+    triadTime = std::min(triadTime, (now() - start));
+  }

-		start = now();
-		perform_triad(dev_a, dev_b, dev_c, scalar);
-		triadTime = std::min( triadTime, (now() - start) );
-	}
+  Kokkos::deep_copy(a, dev_a);
+  Kokkos::deep_copy(b, dev_b);
+  Kokkos::deep_copy(c, dev_c);

-	Kokkos::deep_copy(a, dev_a);
-	Kokkos::deep_copy(b, dev_b);
-	Kokkos::deep_copy(c, dev_c);
+  printf("Performing validation...\n");
+  int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);

-	printf("Performing validation...\n");
-	int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
+  printf(HLINE);

-	printf(HLINE);
+  printf("Copy            %11.2f MB/s\n",
+         (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             copyTime);
+  printf("Scale           %11.2f MB/s\n",
+         (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             scaleTime);
+  printf("Add             %11.2f MB/s\n",
+         (1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             addTime);
+  printf("Triad           %11.2f MB/s\n",
+         (1.0e-06 * 3.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
+             triadTime);

-	printf("Copy            %11.2f MB/s\n",
-		( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / copyTime );
-	printf("Scale           %11.2f MB/s\n",
-		( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / scaleTime );
-	printf("Add             %11.2f MB/s\n",
-		( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / addTime );
-	printf("Triad           %11.2f MB/s\n",
-		( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / triadTime );
+  printf(HLINE);

-	printf(HLINE);
-
-	return rc;
+  return rc;
 }

 int main(int argc, char* argv[]) {
+  printf(HLINE);
+  printf("Kokkos STREAM Benchmark\n");
+  printf(HLINE);

-	printf(HLINE);
-	printf("Kokkos STREAM Benchmark\n");
-	printf(HLINE);
+  Kokkos::initialize(argc, argv);
+  const int rc = run_benchmark();
+  Kokkos::finalize();

-	Kokkos::initialize(argc, argv);
-	const int rc = run_benchmark();
-	Kokkos::finalize();
-
-	return rc;
+  return rc;
 }
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@ -19,6 +19,13 @@ default_arch="sm_35"
 # The default C++ compiler.
 #
 host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
+
+# Default to whatever is in the path
+nvcc_compiler=nvcc
+if [ ! -z $CUDA_ROOT ]; then
+  nvcc_compiler="$CUDA_ROOT/bin/nvcc"
+fi
+
 #host_compiler="icpc"
 #host_compiler="/usr/local/gcc/4.8.3/bin/g++"
 #host_compiler="/usr/local/gcc/4.9.1/bin/g++"
@ -58,7 +65,7 @@ object_files_xlinker=""
 shared_versioned_libraries_host=""
 shared_versioned_libraries=""

-# Does the User set the architecture 
+# Does the User set the architecture
 arch_set=0

 # Does the user overwrite the host compiler
@ -77,7 +84,7 @@ host_only_args=""
 # Just run version on host compiler
 get_host_version=0

-# Enable workaround for CUDA 6.5 for pragma ident 
+# Enable workaround for CUDA 6.5 for pragma ident
 replace_pragma_ident=0

 # Mark first host compiler argument
@ -179,7 +186,7 @@ do
    shift
    ;;
  #Handle known nvcc args
-  --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*|--fmad*)
+  --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--Wext-lambda-captures-this|-Wext-lambda-captures-this)
    cuda_args="$cuda_args $1"
    ;;
  #Handle more known nvcc args
@ -187,7 +194,7 @@ do
    cuda_args="$cuda_args $1"
    ;;
  #Handle known nvcc args that have an argument
-  -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad)
+  -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart)
    cuda_args="$cuda_args $1 $2"
    shift
    ;;
@ -195,11 +202,11 @@ do
    cuda_args="$cuda_args $1"
    ;;
  #Handle unsupported standard flags
-  --std=c++1y|-std=c++1y|--std=c++1z|-std=c++1z|--std=gnu++1y|-std=gnu++1y|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a|--std=c++17|-std=c++17)
+  --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a)
    fallback_std_flag="-std=c++14"
    # this is hopefully just occurring in a downstream project during CMake feature tests
    # we really have no choice here but to accept the flag and change  to an accepted C++ standard
-    echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++14 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration."
+    echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++17 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration."
    if [ -n "$std_flag" ]; then
       warn_std_flag
       shared_args=${shared_args/ $std_flag/}
@ -216,7 +223,25 @@ do
    fi
    std_flag=$corrected_std_flag
    shared_args="$shared_args $std_flag"
-  ;;
+    ;;
+  --std=c++17|-std=c++17)
+    if [ -n "$std_flag" ]; then
+      warn_std_flag
+      shared_args=${shared_args/ $std_flag/}
+    fi
+    # NVCC only has C++17 from version 11 on
+    cuda_main_version=$([[ $(${nvcc_compiler} --version) =~ V([0-9]+) ]] && echo ${BASH_REMATCH[1]})
+    if [ ${cuda_main_version} -lt 11 ]; then
+      fallback_std_flag="-std=c++14"
+      # this is hopefully just occurring in a downstream project during CMake feature tests
+      # we really have no choice here but to accept the flag and change it to an accepted C++ standard
+      echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++14 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration."
+      std_flag=$fallback_std_flag
+    else
+      std_flag=$1
+    fi
+    shared_args="$shared_args $std_flag"
+    ;;
  --std=c++11|-std=c++11|--std=c++14|-std=c++14)
    if [ -n "$std_flag" ]; then
       warn_std_flag
@ -226,6 +251,20 @@ do
    shared_args="$shared_args $std_flag"
    ;;

+  #convert PGI standard flags to something nvcc can handle
+  --c++11|--c++14|--c++17)
+    if [ -n "$std_flag" ]; then
+       warn_std_flag
+       shared_args=${shared_args/ $std_flag/}
+    fi
+    std_flag="-std=${1#--}"
+    shared_args="$shared_args $std_flag"
+    ;;
+
+  #ignore PGI forcing ISO C++-conforming code
+  -A)
+    ;;
+
  #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
  -std=c++98|--std=c++98)
    ;;
@ -237,13 +276,17 @@ do
    ;;
  #strip -Xcompiler because we add it
  -Xcompiler)
-    if [ $first_xcompiler_arg -eq 1 ]; then
-      xcompiler_args="$2"
-      first_xcompiler_arg=0
-    else
-      xcompiler_args="$xcompiler_args,$2"
+    if [[ $2 != "-o" ]]; then
+      if [ $first_xcompiler_arg -eq 1 ]; then
+        xcompiler_args="$2"
+        first_xcompiler_arg=0
+      else
+        xcompiler_args="$xcompiler_args,$2"
+      fi
+      shift
    fi
-    shift
+    # else we have -Xcompiler -o <filename>; in this case just drop -Xcompiler and process
+    # the -o flag with the filename (done above)
    ;;
  #strip of "-x cu" because we add that
  -x)
@ -329,7 +372,7 @@ do
    if [ $first_xcompiler_arg -eq 1 ]; then
      xcompiler_args=$1
      first_xcompiler_arg=0
-    else 
+    else
      xcompiler_args="$xcompiler_args,$1"
    fi
    ;;
@ -387,7 +430,7 @@ if [ $arch_set -ne 1 ]; then
 fi

 #Compose compilation command
-nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
+nvcc_command="$nvcc_compiler $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
 if [ $first_xcompiler_arg -eq 0 ]; then
  nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
 fi
--- a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in
+++ b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in
@ -2,6 +2,7 @@ SET(Kokkos_DEVICES @KOKKOS_ENABLED_DEVICES@)
 SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@)
 SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@)
 SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@)
+SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@")

 # These are needed by KokkosKernels
 FOREACH(DEV ${Kokkos_DEVICES})
@ -38,7 +39,7 @@ include(FindPackageHandleStandardArgs)
 #   kokkos_check(
 #     [DEVICES <devices>...]   # Set of backends (e.g. "OpenMP" and/or "Cuda")
 #     [ARCH <archs>...]        # Target architectures (e.g. "Power9" and/or "Volta70")
-#     [OPTIONS <options>...]   # Optional settings (e.g. "PROFILING")
+#     [OPTIONS <options>...]   # Optional settings (e.g. "TUNING")
 #     [TPLS <tpls>...]         # Third party libraries
 #     [RETURN_VALUE <result>]  # Set a variable that indicates the result of the
 #                              # check instead of a fatal error
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@ -1,6 +1,7 @@

 #if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
-#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#error \
+    "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
 #else
 #define KOKKOS_CORE_CONFIG_H
 #endif
@ -10,7 +11,6 @@
 // KOKKOS_VERSION / 10000 is the major version
 #cmakedefine KOKKOS_VERSION @KOKKOS_VERSION@

-
 /* Execution Spaces */
 #cmakedefine KOKKOS_ENABLE_SERIAL
 #cmakedefine KOKKOS_ENABLE_OPENMP
@ -47,10 +47,9 @@
 #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
 #cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
-#cmakedefine KOKKOS_ENABLE_PROFILING
 #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
+#cmakedefine KOKKOS_ENABLE_TUNING
 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE
-#cmakedefine KOKKOS_ENABLE_ETI
 #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
 #cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
 #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
@ -60,7 +59,7 @@
 #cmakedefine KOKKOS_ENABLE_HWLOC
 #cmakedefine KOKKOS_USE_LIBRT
 #cmakedefine KOKKOS_ENABLE_HWBSPACE
-
+#cmakedefine KOKKOS_ENABLE_LIBDL
 #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND

 #cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@
@ -95,4 +94,6 @@
 #cmakedefine KOKKOS_ARCH_VOLTA70
 #cmakedefine KOKKOS_ARCH_VOLTA72
 #cmakedefine KOKKOS_ARCH_TURING75
-#cmakedefine KOKKOS_ARCH_AMD_EPYC
+#cmakedefine KOKKOS_ARCH_AMPERE80
+#cmakedefine KOKKOS_ARCH_AMD_ZEN
+#cmakedefine KOKKOS_ARCH_AMD_ZEN2
--- a/lib/kokkos/cmake/Modules/CudaToolkit.cmake
+++ b/lib/kokkos/cmake/Modules/CudaToolkit.cmake
@ -0,0 +1,958 @@
+# Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
+# file Copyright.txt or https://cmake.org/licensing for details.
+
+#[=======================================================================[.rst:
+FindCUDAToolkit
+---------------
+
+This script locates the NVIDIA CUDA toolkit and the associated libraries, but
+does not require the ``CUDA`` language be enabled for a given project. This
+module does not search for the NVIDIA CUDA Samples.
+
+Search Behavior
+^^^^^^^^^^^^^^^
+
+Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is
+searched for in the following order:
+
+1. If the ``CUDA`` language has been enabled we will use the directory
+   containing the compiler as the first search location for ``nvcc``.
+
+2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g.,
+   ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it
+   will be searched.  If both an environment variable **and** a
+   configuration variable are specified, the *configuration* variable takes
+   precedence.
+
+   The directory specified here must be such that the executable ``nvcc`` can be
+   found underneath the directory specified by ``CUDAToolkit_ROOT``.  If
+   ``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this
+   package is marked as **not** found.  No subsequent search attempts are
+   performed.
+
+3. If the CUDA_PATH environment variable is defined, it will be searched.
+
+4. The user's path is searched for ``nvcc`` using :command:`find_program`.  If
+   this is found, no subsequent search attempts are performed.  Users are
+   responsible for ensuring that the first ``nvcc`` to show up in the path is
+   the desired path in the event that multiple CUDA Toolkits are installed.
+
+5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is
+   used.  No subsequent search attempts are performed.  No default symbolic link
+   location exists for the Windows platform.
+
+6. The platform specific default install locations are searched.  If exactly one
+   candidate is found, this is used.  The default CUDA Toolkit install locations
+   searched are:
+
+   +-------------+-------------------------------------------------------------+
+   | Platform    | Search Pattern                                              |
+   +=============+=============================================================+
+   | macOS       | ``/Developer/NVIDIA/CUDA-X.Y``                              |
+   +-------------+-------------------------------------------------------------+
+   | Other Unix  | ``/usr/local/cuda-X.Y``                                     |
+   +-------------+-------------------------------------------------------------+
+   | Windows     | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` |
+   +-------------+-------------------------------------------------------------+
+
+   Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as
+   ``/usr/local/cuda-9.0`` or
+   ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0``
+
+   .. note::
+
+       When multiple CUDA Toolkits are installed in the default location of a
+       system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0``
+       exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this
+       package is marked as **not** found.
+
+       There are too many factors involved in making an automatic decision in
+       the presence of multiple CUDA Toolkits being installed.  In this
+       situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or
+       (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for
+       :command:`find_program` to find.
+
+Options
+^^^^^^^
+
+``VERSION``
+    If specified, describes the version of the CUDA Toolkit to search for.
+
+``REQUIRED``
+    If specified, configuration will error if a suitable CUDA Toolkit is not
+    found.
+
+``QUIET``
+    If specified, the search for a suitable CUDA Toolkit will not produce any
+    messages.
+
+``EXACT``
+    If specified, the CUDA Toolkit is considered found only if the exact
+    ``VERSION`` specified is recovered.
+
+Imported targets
+^^^^^^^^^^^^^^^^
+
+An :ref:`imported target <Imported targets>` named ``CUDA::toolkit`` is provided.
+
+This module defines :prop_tgt:`IMPORTED` targets for each
+of the following libraries that are part of the CUDAToolkit:
+
+- :ref:`CUDA Runtime Library<cuda_toolkit_rt_lib>`
+- :ref:`CUDA Driver Library<cuda_toolkit_driver_lib>`
+- :ref:`cuBLAS<cuda_toolkit_cuBLAS>`
+- :ref:`cuFFT<cuda_toolkit_cuFFT>`
+- :ref:`cuRAND<cuda_toolkit_cuRAND>`
+- :ref:`cuSOLVER<cuda_toolkit_cuSOLVER>`
+- :ref:`cuSPARSE<cuda_toolkit_cuSPARSE>`
+- :ref:`cuPTI<cuda_toolkit_cupti>`
+- :ref:`NPP<cuda_toolkit_NPP>`
+- :ref:`nvBLAS<cuda_toolkit_nvBLAS>`
+- :ref:`nvGRAPH<cuda_toolkit_nvGRAPH>`
+- :ref:`nvJPEG<cuda_toolkit_nvJPEG>`
+- :ref:`nvidia-ML<cuda_toolkit_nvML>`
+- :ref:`nvRTC<cuda_toolkit_nvRTC>`
+- :ref:`nvToolsExt<cuda_toolkit_nvToolsExt>`
+- :ref:`OpenCL<cuda_toolkit_opencl>`
+- :ref:`cuLIBOS<cuda_toolkit_cuLIBOS>`
+
+.. _`cuda_toolkit_rt_lib`:
+
+CUDA Runtime Library
+""""""""""""""""""""
+
+The CUDA Runtime library (cudart) is what most applications will typically
+need to link against to make any calls such as `cudaMalloc`, and `cudaFree`.
+
+Targets Created:
+
+- ``CUDA::cudart``
+- ``CUDA::cudart_static``
+
+.. _`cuda_toolkit_driver_lib`:
+
+CUDA Driver Library
+""""""""""""""""""""
+
+The CUDA Driver library (cuda) is used by applications that use calls
+such as `cuMemAlloc`, and `cuMemFree`. This is generally used by advanced users.
+
+
+Targets Created:
+
+- ``CUDA::cuda_driver``
+
+.. _`cuda_toolkit_cuBLAS`:
+
+cuBLAS
+""""""
+
+The `cuBLAS <https://docs.nvidia.com/cuda/cublas/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cublas``
+- ``CUDA::cublas_static``
+
+.. _`cuda_toolkit_cuFFT`:
+
+cuFFT
+"""""
+
+The `cuFFT <https://docs.nvidia.com/cuda/cufft/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cufft``
+- ``CUDA::cufftw``
+- ``CUDA::cufft_static``
+- ``CUDA::cufftw_static``
+
+cuRAND
+""""""
+
+The `cuRAND <https://docs.nvidia.com/cuda/curand/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::curand``
+- ``CUDA::curand_static``
+
+.. _`cuda_toolkit_cuSOLVER`:
+
+cuSOLVER
+""""""""
+
+The `cuSOLVER <https://docs.nvidia.com/cuda/cusolver/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cusolver``
+- ``CUDA::cusolver_static``
+
+.. _`cuda_toolkit_cuSPARSE`:
+
+cuSPARSE
+""""""""
+
+The `cuSPARSE <https://docs.nvidia.com/cuda/cusparse/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::cusparse``
+- ``CUDA::cusparse_static``
+
+.. _`cuda_toolkit_cupti`:
+
+cupti
+"""""
+
+The `NVIDIA CUDA Profiling Tools Interface <https://developer.nvidia.com/CUPTI>`_.
+
+Targets Created:
+
+- ``CUDA::cupti``
+- ``CUDA::cupti_static``
+
+.. _`cuda_toolkit_NPP`:
+
+NPP
+"""
+
+The `NPP <https://docs.nvidia.com/cuda/npp/index.html>`_ libraries.
+
+Targets Created:
+
+- `nppc`:
+
+  - ``CUDA::nppc``
+  - ``CUDA::nppc_static``
+
+- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h`
+
+  - ``CUDA::nppial``
+  - ``CUDA::nppial_static``
+
+- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h`
+
+  - ``CUDA::nppicc``
+  - ``CUDA::nppicc_static``
+
+- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h`
+
+  - ``CUDA::nppicom``
+  - ``CUDA::nppicom_static``
+
+- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h`
+
+  - ``CUDA::nppidei``
+  - ``CUDA::nppidei_static``
+
+- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h`
+
+  - ``CUDA::nppif``
+  - ``CUDA::nppif_static``
+
+- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h`
+
+  - ``CUDA::nppig``
+  - ``CUDA::nppig_static``
+
+- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h`
+
+  - ``CUDA::nppim``
+  - ``CUDA::nppim_static``
+
+- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h`
+
+  - ``CUDA::nppist``
+  - ``CUDA::nppist_static``
+
+- `nppisu`: Memory support functions in `nppi_support_functions.h`
+
+  - ``CUDA::nppisu``
+  - ``CUDA::nppisu_static``
+
+- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h`
+
+  - ``CUDA::nppitc``
+  - ``CUDA::nppitc_static``
+
+- `npps`:
+
+  - ``CUDA::npps``
+  - ``CUDA::npps_static``
+
+.. _`cuda_toolkit_nvBLAS`:
+
+nvBLAS
+""""""
+
+The `nvBLAS <https://docs.nvidia.com/cuda/nvblas/index.html>`_ libraries.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvblas``
+
+.. _`cuda_toolkit_nvGRAPH`:
+
+nvGRAPH
+"""""""
+
+The `nvGRAPH <https://docs.nvidia.com/cuda/nvgraph/index.html>`_ library.
+
+Targets Created:
+
+- ``CUDA::nvgraph``
+- ``CUDA::nvgraph_static``
+
+
+.. _`cuda_toolkit_nvJPEG`:
+
+nvJPEG
+""""""
+
+The `nvJPEG <https://docs.nvidia.com/cuda/nvjpeg/index.html>`_ library.
+Introduced in CUDA 10.
+
+Targets Created:
+
+- ``CUDA::nvjpeg``
+- ``CUDA::nvjpeg_static``
+
+.. _`cuda_toolkit_nvRTC`:
+
+nvRTC
+"""""
+
+The `nvRTC <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ (Runtime Compilation) library.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvrtc``
+
+.. _`cuda_toolkit_nvml`:
+
+nvidia-ML
+"""""""""
+
+The `NVIDIA Management Library <https://developer.nvidia.com/nvidia-management-library-nvml>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvml``
+
+.. _`cuda_toolkit_nvToolsExt`:
+
+nvToolsExt
+""""""""""
+
+The `NVIDIA Tools Extension <https://docs.nvidia.com/gameworks/content/gameworkslibrary/nvtx/nvidia_tools_extension_library_nvtx.htm>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::nvToolsExt``
+
+.. _`cuda_toolkit_opencl`:
+
+OpenCL
+""""""
+
+The `NVIDIA OpenCL Library <https://developer.nvidia.com/opencl>`_.
+This is a shared library only.
+
+Targets Created:
+
+- ``CUDA::OpenCL``
+
+.. _`cuda_toolkit_cuLIBOS`:
+
+cuLIBOS
+"""""""
+
+The cuLIBOS library is a backend thread abstraction layer library which is
+static only.  The ``CUDA::cublas_static``, ``CUDA::cusparse_static``,
+``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP
+libraries all automatically have this dependency linked.
+
+Target Created:
+
+- ``CUDA::culibos``
+
+**Note**: direct usage of this target by consumers should not be necessary.
+
+.. _`cuda_toolkit_cuRAND`:
+
+
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+``CUDAToolkit_FOUND``
+    A boolean specifying whether or not the CUDA Toolkit was found.
+
+``CUDAToolkit_VERSION``
+    The exact version of the CUDA Toolkit found (as reported by
+    ``nvcc --version``).
+
+``CUDAToolkit_VERSION_MAJOR``
+    The major version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_MINOR``
+    The minor version of the CUDA Toolkit.
+
+``CUDAToolkit_VERSION_PATCH``
+    The patch version of the CUDA Toolkit.
+
+``CUDAToolkit_BIN_DIR``
+    The path to the CUDA Toolkit library directory that contains the CUDA
+    executable ``nvcc``.
+
+``CUDAToolkit_INCLUDE_DIRS``
+    The path to the CUDA Toolkit ``include`` folder containing the header files
+    required to compile a project linking against CUDA.
+
+``CUDAToolkit_LIBRARY_DIR``
+    The path to the CUDA Toolkit library directory that contains the CUDA
+    Runtime library ``cudart``.
+
+``CUDAToolkit_TARGET_DIR``
+    The path to the CUDA Toolkit directory including the target architecture
+    when cross-compiling. When not cross-compiling this will be equivalent to
+    ``CUDAToolkit_ROOT_DIR``.
+
+``CUDAToolkit_NVCC_EXECUTABLE``
+    The path to the NVIDIA CUDA compiler ``nvcc``.  Note that this path may
+    **not** be the same as
+    :variable:`CMAKE_CUDA_COMPILER <CMAKE_<LANG>_COMPILER>`.  ``nvcc`` must be
+    found to determine the CUDA Toolkit version as well as determining other
+    features of the Toolkit.  This variable is set for the convenience of
+    modules that depend on this one.
+
+
+#]=======================================================================]
+
+# NOTE: much of this was simply extracted from FindCUDA.cmake.
+
+#   James Bigler, NVIDIA Corp (nvidia.com - jbigler)
+#   Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html
+#
+#   Copyright (c) 2008 - 2009 NVIDIA Corporation.  All rights reserved.
+#
+#   Copyright (c) 2007-2009
+#   Scientific Computing and Imaging Institute, University of Utah
+#
+#   This code is licensed under the MIT License.  See the FindCUDA.cmake script
+#   for the text of the license.
+
+# The MIT License
+#
+# License for the specific language governing rights and limitations under
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+
+# For NVCC we can easily deduce the SDK binary directory from the compiler path.
+if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+  get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY)
+  set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "")
+  mark_as_advanced(CUDAToolkit_BIN_DIR)
+  unset(cuda_dir)
+endif()
+
+# import_target_link_libraries(<target> [SYSTEM|INTERFACE|PUBLIC] <items>...)
+# Helper for attaching link items to a target. Two implementations are
+# provided, selected by the running CMake version.
+IF(CMAKE_VERSION VERSION_LESS "3.12.0")
+  # CMake < 3.12: edit the INTERFACE_LINK_LIBRARIES property directly.
+  # NOTE(review): presumably needed because older CMake cannot call
+  # target_link_libraries() on imported targets -- confirm.
+  function(import_target_link_libraries target)
+    # Parse off the SYSTEM/INTERFACE/PUBLIC keywords; they are ignored and
+    # everything else ends up in HACK_UNPARSED_ARGUMENTS.
+    cmake_parse_arguments(HACK
+      "SYSTEM;INTERFACE;PUBLIC"
+      ""
+      ""
+      ${ARGN}
+    )
+    get_target_property(LIBS ${target} INTERFACE_LINK_LIBRARIES)
+    # Extend the existing property value if there is one, otherwise start a
+    # new list from the remaining arguments.
+    if (LIBS)
+      list(APPEND LIBS ${HACK_UNPARSED_ARGUMENTS})
+    else()
+      set(LIBS ${HACK_UNPARSED_ARGUMENTS})
+    endif()
+    set_target_properties(${target} PROPERTIES
+      INTERFACE_LINK_LIBRARIES "${LIBS}")
+  endfunction()
+ELSE()
+  # CMake >= 3.12: forward all arguments to target_link_libraries() unchanged.
+  function(import_target_link_libraries)
+    target_link_libraries(${ARGN})
+  endfunction()
+ENDIF()
+
+# import_target_link_directories(<target> [SYSTEM|INTERFACE|PUBLIC] <dirs>...)
+# Helper for attaching library search directories to a target. Two
+# implementations are provided, selected by the running CMake version.
+IF(CMAKE_VERSION VERSION_LESS "3.13.0")
+  # CMake < 3.13: each directory is appended to INTERFACE_LINK_LIBRARIES as a
+  # "-L<dir>" entry instead of calling target_link_directories().
+  function(import_target_link_directories target)
+    # Parse off the SYSTEM/INTERFACE/PUBLIC keywords; they are ignored and
+    # everything else ends up in HACK_UNPARSED_ARGUMENTS.
+    cmake_parse_arguments(HACK
+      "SYSTEM;INTERFACE;PUBLIC"
+      ""
+      ""
+      ${ARGN}
+    )
+    get_target_property(LINK_LIBS ${target} INTERFACE_LINK_LIBRARIES)
+    # NOTE(review): LINK_LIBS_LIST is only assigned when the property is set;
+    # otherwise it keeps whatever value is visible from the calling scope --
+    # confirm no caller defines a variable of that name.
+    if (LINK_LIBS) #could be not-found
+      set(LINK_LIBS_LIST ${LINK_LIBS})
+    endif()
+    foreach(LIB ${HACK_UNPARSED_ARGUMENTS})
+      list(APPEND LINK_LIBS_LIST -L${LIB})
+    endforeach()
+    set_target_properties(${target} PROPERTIES
+      INTERFACE_LINK_LIBRARIES "${LINK_LIBS_LIST}")
+  endfunction()
+ELSE()
+  # CMake >= 3.13: forward all arguments to target_link_directories() unchanged.
+  function(import_target_link_directories)
+    target_link_directories(${ARGN})
+  endfunction()
+ENDIF()
+
+# import_target_include_directories(<target> [SYSTEM|INTERFACE|PUBLIC] <dirs>...)
+# Helper for attaching include directories to a target. Two implementations
+# are provided, selected by the running CMake version.
+IF(CMAKE_VERSION VERSION_LESS "3.12.0")
+  # CMake < 3.12: edit the INTERFACE_INCLUDE_DIRECTORIES property directly.
+  function(import_target_include_directories target)
+    # Parse off the SYSTEM/INTERFACE/PUBLIC keywords; they are ignored and
+    # everything else ends up in HACK_UNPARSED_ARGUMENTS.
+    cmake_parse_arguments(HACK
+      "SYSTEM;INTERFACE;PUBLIC"
+      ""
+      ""
+      ${ARGN}
+    )
+    # Read the directories already on the target into the same variable that
+    # is tested below, so existing entries are extended rather than replaced.
+    get_target_property(INCLUDE_DIRS ${target} INTERFACE_INCLUDE_DIRECTORIES)
+    if (INCLUDE_DIRS)
+      list(APPEND INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS})
+    else()
+      # Property unset (NOTFOUND): start a new list from the arguments.
+      set(INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS})
+    endif()
+    set_target_properties(${target} PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${INCLUDE_DIRS}")
+  endfunction()
+ELSE()
+  # CMake >= 3.12: forward all arguments to target_include_directories() unchanged.
+  function(import_target_include_directories)
+    target_include_directories(${ARGN})
+  endfunction()
+ENDIF()
+
+# Try language- or user-provided path first.
+if(CUDAToolkit_BIN_DIR)
+  find_program(CUDAToolkit_NVCC_EXECUTABLE
+    NAMES nvcc nvcc.exe
+    PATHS ${CUDAToolkit_BIN_DIR}
+    NO_DEFAULT_PATH
+    )
+endif()
+
+# Search using CUDAToolkit_ROOT (CMake variable first, then the environment
+# variable of the same name), followed by the CUDA_PATH environment variable.
+# The root is passed explicitly as HINTS because find_program only consults
+# <PackageName>_ROOT on its own while running inside find_package(CUDAToolkit),
+# which is not the case when this file is pulled in with include().
+# If nvcc was already found above, the cached result is kept and this call
+# does nothing.
+find_program(CUDAToolkit_NVCC_EXECUTABLE
+  NAMES nvcc nvcc.exe
+  HINTS ${CUDAToolkit_ROOT}
+        ENV CUDAToolkit_ROOT
+  PATHS ENV CUDA_PATH
+  PATH_SUFFIXES bin
+)
+
+# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error.
+if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT}))
+  # Declare error messages now, print later depending on find_package args.
+  set(fail_base "Could not find nvcc executable in path specified by")
+  set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}")
+  set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}")
+
+  if (CUDAToolkit_FIND_REQUIRED)
+    if (DEFINED CUDAToolkit_ROOT)
+      message(FATAL_ERROR ${cuda_root_fail})
+    elseif (DEFINED ENV{CUDAToolkit_ROOT})
+      message(FATAL_ERROR ${env_cuda_root_fail})
+    endif()
+  else()
+    if (NOT CUDAToolkit_FIND_QUIETLY)
+      if (DEFINED CUDAToolkit_ROOT)
+        message(STATUS ${cuda_root_fail})
+      elseif (DEFINED ENV{CUDAToolkit_ROOT})
+        message(STATUS ${env_cuda_root_fail})
+      endif()
+    endif()
+    set(CUDAToolkit_FOUND FALSE)
+    unset(fail_base)
+    unset(cuda_root_fail)
+    unset(env_cuda_root_fail)
+    return()
+  endif()
+endif()
+
+# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults.
+#
+# - Linux: /usr/local/cuda-X.Y
+# - macOS: /Developer/NVIDIA/CUDA-X.Y
+# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y
+#
+# We will also search the default symlink location /usr/local/cuda first since
+# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked
+# directory is the desired location.
+if (NOT CUDAToolkit_NVCC_EXECUTABLE)
+  if (UNIX)
+    if (NOT APPLE)
+      set(platform_base "/usr/local/cuda-")
+    else()
+      set(platform_base "/Developer/NVIDIA/CUDA-")
+    endif()
+  else()
+    set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v")
+  endif()
+
+  # Build out a descending list of possible cuda installations, e.g. 10.0 before 9.0.
+  file(GLOB possible_paths "${platform_base}*")
+  # Iterate the glob results and create a descending list.
+  set(possible_versions)
+  foreach (p ${possible_paths})
+    # Extract version number from end of string
+    string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p})
+    if (IS_DIRECTORY ${p} AND p_version)
+      list(APPEND possible_versions ${p_version})
+    endif()
+  endforeach()
+
+  # Cannot use list(SORT) because that is alphabetical, we need numerical.
+  # NOTE: this is not an efficient sorting strategy.  But even if a user had
+  # every possible version of CUDA installed, this wouldn't create any
+  # significant overhead.
+  set(versions)
+  foreach (v ${possible_versions})
+    list(LENGTH versions num_versions)
+    # First version, nothing to compare with so just append.
+    if (num_versions EQUAL 0)
+      list(APPEND versions ${v})
+    else()
+      # Loop through list.  Insert at an index when comparison is
+      # VERSION_GREATER since we want a descending list.  Duplicates will not
+      # happen since this came from a glob list of directories.
+      set(i 0)
+      set(early_terminate FALSE)
+      while (i LESS num_versions)
+        list(GET versions ${i} curr)
+        if (v VERSION_GREATER curr)
+          list(INSERT versions ${i} ${v})
+          set(early_terminate TRUE)
+          break()
+        endif()
+        math(EXPR i "${i} + 1")
+      endwhile()
+      # If it did not get inserted, place it at the end.
+      if (NOT early_terminate)
+        list(APPEND versions ${v})
+      endif()
+    endif()
+  endforeach()
+
+  # With a descending list of versions, populate possible paths to search.
+  set(search_paths)
+  foreach (v ${versions})
+    list(APPEND search_paths "${platform_base}${v}")
+  endforeach()
+
+  # Force the global default /usr/local/cuda to the front on Unix.
+  if (UNIX)
+    list(INSERT search_paths 0 "/usr/local/cuda")
+  endif()
+
+  # Now search for nvcc again using the platform default search paths.
+  find_program(CUDAToolkit_NVCC_EXECUTABLE
+    NAMES nvcc nvcc.exe
+    PATHS ${search_paths}
+    PATH_SUFFIXES bin
+  )
+
+  # We are done with these variables now, cleanup for caller.
+  unset(platform_base)
+  unset(possible_paths)
+  unset(possible_versions)
+  unset(versions)
+  unset(i)
+  unset(early_terminate)
+  unset(search_paths)
+
+  if (NOT CUDAToolkit_NVCC_EXECUTABLE)
+    if (CUDAToolkit_FIND_REQUIRED)
+      message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.")
+    elseif(NOT CUDAToolkit_FIND_QUIETLY)
+      message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.")
+    endif()
+
+    set(CUDAToolkit_FOUND FALSE)
+    return()
+  endif()
+endif()
+
+if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE)
+  get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
+  set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE)
+  mark_as_advanced(CUDAToolkit_BIN_DIR)
+  unset(cuda_dir)
+endif()
+
+if(CUDAToolkit_NVCC_EXECUTABLE AND
+   CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER)
+  # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value
+  # This if statement will always match, but is used to provide variables for MATCH 1,2,3...
+  if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+    set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+    set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+    set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+    set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}")
+  endif()
+else()
+  # Compute the version by invoking nvcc
+  execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+  if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+    set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+    set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+    set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+    set(CUDAToolkit_VERSION  "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
+  endif()
+  unset(NVCC_OUT)
+endif()
+
+
+get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE)
+
+# Handle cross compilation: pick the toolkit "targets/<name>" sub-directory
+# matching the target processor and make it visible to the find_* calls below.
+if(CMAKE_CROSSCOMPILING)
+  if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a")
+    # Support for NVPACK
+    set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    # Support for arm cross compilation
+    set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf")
+  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    # Support for aarch64 cross compilation
+    if (ANDROID_ARCH_NAME STREQUAL "arm64")
+      set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi")
+    else()
+      set(CUDAToolkit_TARGET_NAME "aarch64-linux")
+    endif (ANDROID_ARCH_NAME STREQUAL "arm64")
+  elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+      set(CUDAToolkit_TARGET_NAME "x86_64-linux")
+  endif()
+
+  if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
+    set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}")
+    # add known CUDA target root path to the set of directories we search for programs, libraries and headers
+    # list(INSERT ... 0 ...) is used instead of list(PREPEND ...): PREPEND only
+    # exists from CMake 3.15 on, while this module also carries code paths for
+    # older CMake versions.
+    list(INSERT CMAKE_FIND_ROOT_PATH 0 "${CUDAToolkit_TARGET_DIR}")
+
+    # Mark that we need to pop the root search path changes after we have
+    # found all cuda libraries so that searches for our cross-compilation
+    # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or
+    # PATH
+    set(_CUDAToolkit_Pop_ROOT_PATH True)
+  endif()
+else()
+  # Not cross compiling
+  set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}")
+  # Now that we have the real ROOT_DIR, find components inside it.
+  list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR})
+
+  # Mark that we need to pop the prefix path changes after we have
+  # found the cudart library.
+  set(_CUDAToolkit_Pop_Prefix True)
+endif()
+
+
+# Find the include/ directory
+find_path(CUDAToolkit_INCLUDE_DIR
+  NAMES cuda_runtime.h
+)
+
+# And find the CUDA Runtime Library libcudart
+find_library(CUDA_CUDART
+  NAMES cudart
+  PATH_SUFFIXES lib64 lib/x64
+)
+if (NOT CUDA_CUDART)
+  find_library(CUDA_CUDART
+    NAMES cudart
+    PATH_SUFFIXES lib64/stubs lib/x64/stubs
+  )
+endif()
+
+if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY)
+  message(STATUS "Unable to find cudart library.")
+endif()
+
+unset(CUDAToolkit_ROOT_DIR)
+if(_CUDAToolkit_Pop_Prefix)
+  list(REMOVE_AT CMAKE_PREFIX_PATH -1)
+  unset(_CUDAToolkit_Pop_Prefix)
+endif()
+
+#-----------------------------------------------------------------------------
+# Perform version comparison and validate all required variables are set.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CUDAToolkit
+  REQUIRED_VARS
+    CUDAToolkit_INCLUDE_DIR
+    CUDA_CUDART
+    CUDAToolkit_NVCC_EXECUTABLE
+  VERSION_VAR
+    CUDAToolkit_VERSION
+)
+mark_as_advanced(CUDA_CUDART
+                 CUDAToolkit_INCLUDE_DIR
+                 CUDAToolkit_NVCC_EXECUTABLE
+                 )
+
+#-----------------------------------------------------------------------------
+# Construct result variables
+if(CUDAToolkit_FOUND)
+ set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR})
+ get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE)
+endif()
+
+#-----------------------------------------------------------------------------
+# Construct import targets
+if(CUDAToolkit_FOUND)
+
+  function(_CUDAToolkit_find_and_add_import_lib lib_name)
+    cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES" ${ARGN})
+
+    set(search_names ${lib_name} ${arg_ALT})
+
+    find_library(CUDA_${lib_name}_LIBRARY
+      NAMES ${search_names}
+      HINTS ${CUDAToolkit_LIBRARY_DIR}
+            ENV CUDA_PATH
+      PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
+                    ${arg_EXTRA_PATH_SUFFIXES}
+    )
+    # Don't try any stub directories until we have exhausted all other
+    # search locations.
+    if(NOT CUDA_${lib_name}_LIBRARY)
+      find_library(CUDA_${lib_name}_LIBRARY
+        NAMES ${search_names}
+        HINTS ${CUDAToolkit_LIBRARY_DIR}
+              ENV CUDA_PATH
+        PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs
+      )
+    endif()
+
+    mark_as_advanced(CUDA_${lib_name}_LIBRARY)
+
+    if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY)
+      add_library(CUDA::${lib_name} IMPORTED INTERFACE)
+      import_target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
+      import_target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}")
+      foreach(dep ${arg_DEPS})
+        if(TARGET CUDA::${dep})
+          import_target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep})
+        endif()
+      endforeach()
+    endif()
+  endfunction()
+
+  if(NOT TARGET CUDA::toolkit)
+    add_library(CUDA::toolkit IMPORTED INTERFACE)
+    import_target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}")
+    import_target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}")
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda)
+
+  _CUDAToolkit_find_and_add_import_lib(cudart)
+  _CUDAToolkit_find_and_add_import_lib(cudart_static)
+
+  # setup dependencies that are required for cudart_static when building
+  # on linux. These are generally only required when using the CUDA toolkit
+  # when CUDA language is disabled
+  if(NOT TARGET CUDA::cudart_static_deps
+     AND TARGET CUDA::cudart_static)
+
+    add_library(CUDA::cudart_static_deps IMPORTED INTERFACE)
+    import_target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps)
+
+    if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER))
+      find_package(Threads REQUIRED)
+      import_target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS})
+    endif()
+
+    if(UNIX AND NOT APPLE)
+      # On Linux, you must link against librt when using the static cuda runtime.
+      find_library(CUDAToolkit_rt_LIBRARY rt)
+      mark_as_advanced(CUDAToolkit_rt_LIBRARY)
+      if(NOT CUDAToolkit_rt_LIBRARY)
+        message(WARNING "Could not find librt library, needed by CUDA::cudart_static")
+      else()
+        import_target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY})
+      endif()
+    endif()
+  endif()
+
+  _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library
+  foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib})
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos)
+  endforeach()
+
+  # cuFFTW depends on cuFFT: the shared variant links CUDA::cufft, the static
+  # variant (CUDA::cufftw_static) links CUDA::cufft_static.
+  _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
+  _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static)
+
+  # cuSOLVER depends on cuBLAS, and cuSPARSE
+  _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse)
+  _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos)
+
+  # nvGRAPH depends on cuRAND, and cuSOLVER.
+  _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver)
+  _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static)
+
+  # Process the majority of the NPP libraries.
+  foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc)
+    _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static)
+  endforeach()
+
+  _CUDAToolkit_find_and_add_import_lib(cupti
+                                       EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
+                                                           ../extras/CUPTI/lib/)
+  _CUDAToolkit_find_and_add_import_lib(cupti_static
+                                       EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/
+                                                           ../extras/CUPTI/lib/)
+
+  _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver)
+
+  _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml)
+
+  if(WIN32)
+    # nvtools can be installed outside the CUDA toolkit directory
+    # so prefer the NVTOOLSEXT_PATH windows only environment variable
+    # In addition on windows the most common name is nvToolsExt64_1
+    find_library(CUDA_nvToolsExt_LIBRARY
+      NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt
+      PATHS ENV NVTOOLSEXT_PATH
+            ENV CUDA_PATH
+      PATH_SUFFIXES lib/x64 lib
+    )
+  endif()
+  _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64)
+
+  _CUDAToolkit_find_and_add_import_lib(OpenCL)
+endif()
+
+if(_CUDAToolkit_Pop_ROOT_PATH)
+  list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0)
+  unset(_CUDAToolkit_Pop_ROOT_PATH)
+endif()
--- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake
+++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake
@ -1,17 +1,37 @@
-
-IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
-   # Note: "stubs" suffix allows CMake to find the dummy
-   # libcuda.so provided by the NVIDIA CUDA Toolkit for
-   # cross-compiling CUDA on a host without a GPU.
-   KOKKOS_FIND_IMPORTED(CUDA INTERFACE
-    LIBRARIES cudart cuda
-    LIBRARY_PATHS ENV LD_LIBRARY_PATH ENV CUDA_PATH /usr/local/cuda
-    LIBRARY_SUFFIXES lib lib64 lib/stubs lib64/stubs
-    ALLOW_SYSTEM_PATH_FALLBACK
-   )
-ELSE()
-   KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
-    LINK_LIBRARIES cuda
-   )
+IF (NOT CUDAToolkit_ROOT)
+  IF (NOT CUDA_ROOT)
+    SET(CUDA_ROOT $ENV{CUDA_ROOT})
+  ENDIF()
+  IF(CUDA_ROOT)
+    SET(CUDAToolkit_ROOT ${CUDA_ROOT})
+  ENDIF()
 ENDIF()

+IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0")
+  find_package(CUDAToolkit)
+ELSE()
+  include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake)
+ENDIF()
+
+
+IF (TARGET CUDA::cudart)
+  SET(FOUND_CUDART TRUE)
+  KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart)
+ELSE()
+  SET(FOUND_CUDART FALSE)
+ENDIF()
+
+IF (TARGET CUDA::cuda_driver)
+  SET(FOUND_CUDA_DRIVER TRUE)
+  KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver)
+ELSE()
+  SET(FOUND_CUDA_DRIVER FALSE) # same variable that FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ...) checks
+ENDIF()
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER)
+IF (FOUND_CUDA_DRIVER AND FOUND_CUDART)
+  KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
+    LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart
+  )
+ENDIF()
--- a/lib/kokkos/core/src/eti/OpenMP/Kokkos_OpenMP_ViewCopyETIInst_int64_t_int64_t_LayoutStride_Rank5.cpp
+++ b/lib/kokkos/core/src/eti/OpenMP/Kokkos_OpenMP_ViewCopyETIInst_int64_t_int64_t_LayoutStride_Rank5.cpp
@ -1,3 +1,4 @@
+/*
 //@HEADER
 // ************************************************************************
 //
@ -8,8 +9,6 @@
 // Under the terms of Contract DE-NA0003525 with NTESS,
 // the U.S. Government retains certain rights in this software.
 //
-// Kokkos is licensed under 3-clause BSD terms of use:
-//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@ -41,18 +40,43 @@
 //
 // ************************************************************************
 //@HEADER
+*/

-#define KOKKOS_IMPL_COMPILING_LIBRARY true
-#include <Kokkos_Core.hpp>
-namespace Kokkos {
-namespace Impl {
-KOKKOS_IMPL_VIEWCOPY_ETI_INST(int64_t*****, LayoutStride, LayoutRight, OpenMP,
-                              int64_t)
-KOKKOS_IMPL_VIEWCOPY_ETI_INST(int64_t*****, LayoutStride, LayoutLeft, OpenMP,
-                              int64_t)
-KOKKOS_IMPL_VIEWCOPY_ETI_INST(int64_t*****, LayoutStride, LayoutStride, OpenMP,
-                              int64_t)
-KOKKOS_IMPL_VIEWFILL_ETI_INST(int64_t*****, LayoutStride, OpenMP, int64_t)
+#include <iostream>

-}  // namespace Impl
-}  // namespace Kokkos
+int main() {
+  cudaDeviceProp device_properties;
+  const cudaError_t error = cudaGetDeviceProperties(&device_properties,
+                                                    /*device*/ 0);
+  if (error != cudaSuccess) {
+    std::cout << "CUDA error: " << cudaGetErrorString(error) << '\n';
+    return error;
+  }
+  unsigned int const compute_capability =
+      device_properties.major * 10 + device_properties.minor;
+#ifdef SM_ONLY
+  std::cout << compute_capability;
+#else
+  switch (compute_capability) {
+      // clang-format off
+    case 30: std::cout << "Set -DKokkos_ARCH_KEPLER30=ON ." << std::endl; break;
+    case 32: std::cout << "Set -DKokkos_ARCH_KEPLER32=ON ." << std::endl; break;
+    case 35: std::cout << "Set -DKokkos_ARCH_KEPLER35=ON ." << std::endl; break;
+    case 37: std::cout << "Set -DKokkos_ARCH_KEPLER37=ON ." << std::endl; break;
+    case 50: std::cout << "Set -DKokkos_ARCH_MAXWELL50=ON ." << std::endl; break;
+    case 52: std::cout << "Set -DKokkos_ARCH_MAXWELL52=ON ." << std::endl; break;
+    case 53: std::cout << "Set -DKokkos_ARCH_MAXWELL53=ON ." << std::endl; break;
+    case 60: std::cout << "Set -DKokkos_ARCH_PASCAL60=ON ." << std::endl; break;
+    case 61: std::cout << "Set -DKokkos_ARCH_PASCAL61=ON ." << std::endl; break;
+    case 70: std::cout << "Set -DKokkos_ARCH_VOLTA70=ON ." << std::endl; break;
+    case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break;
+    case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break;
+    case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break;
+    default:
+      std::cout << "Compute capability " << compute_capability
+                << " is not supported" << std::endl;
+      // clang-format on
+  }
+#endif
+  return 0;
+}
--- a/lib/kokkos/cmake/fake_tribits.cmake
+++ b/lib/kokkos/cmake/fake_tribits.cmake
@ -88,7 +88,7 @@ FUNCTION(KOKKOS_ADD_TEST)
  if (KOKKOS_HAS_TRILINOS)
    CMAKE_PARSE_ARGUMENTS(TEST
      ""
-      "EXE;NAME"
+      "EXE;NAME;TOOL"
      ""
      ${ARGN})
    IF(TEST_EXE)
@ -104,10 +104,15 @@ FUNCTION(KOKKOS_ADD_TEST)
      NUM_MPI_PROCS 1
      ${TEST_UNPARSED_ARGUMENTS}
    )
+
+    if(TEST_TOOL)
+      add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool
+      set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>")
+    endif()
  else()
    CMAKE_PARSE_ARGUMENTS(TEST
      "WILL_FAIL"
-      "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME"
+      "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL"
      "CATEGORIES;CMD_ARGS"
      ${ARGN})
    # To match Tribits, we should always be receiving
@ -135,6 +140,10 @@ FUNCTION(KOKKOS_ADD_TEST)
    IF(TEST_PASS_REGULAR_EXPRESSION)
      SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION})
    ENDIF()
+    if(TEST_TOOL)
+      add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool
+      set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$<TARGET_FILE:${TEST_TOOL}>")
+    endif()
    VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS})
  endif()
 ENDFUNCTION()
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@ -2,11 +2,14 @@
 FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION)
  #all optimizations off by default
  KOKKOS_OPTION(ARCH_${SUFFIX} OFF BOOL "Optimize for ${DESCRIPTION} (${DEV_TYPE})")
-  IF (KOKKOS_ARCH_${SUFFIX})
+  SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE)
+  SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE)
+  SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE)
+  SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE)
+  IF(KOKKOS_ARCH_${SUFFIX})
    LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX})
    SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE)
  ENDIF()
-  SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE)
 ENDFUNCTION()


@ -15,6 +18,10 @@ KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID)
 KOKKOS_CFG_DEPENDS(ARCH DEVICES)
 KOKKOS_CFG_DEPENDS(ARCH OPTIONS)

+KOKKOS_CHECK_DEPRECATED_OPTIONS(
+  ARCH_EPYC   "Please replace EPYC with ZEN or ZEN2, depending on your platform"
+  ARCH_RYZEN  "Please replace RYZEN with ZEN or ZEN2, depending on your platform"
+)

 #-------------------------------------------------------------------------------
 # List of possible host architectures.
@ -51,9 +58,12 @@ KOKKOS_ARCH_OPTION(PASCAL61        GPU  "NVIDIA Pascal generation CC 6.1")
 KOKKOS_ARCH_OPTION(VOLTA70         GPU  "NVIDIA Volta generation CC 7.0")
 KOKKOS_ARCH_OPTION(VOLTA72         GPU  "NVIDIA Volta generation CC 7.2")
 KOKKOS_ARCH_OPTION(TURING75        GPU  "NVIDIA Turing generation CC 7.5")
-KOKKOS_ARCH_OPTION(EPYC            HOST "AMD Epyc architecture")
+KOKKOS_ARCH_OPTION(AMPERE80        GPU  "NVIDIA Ampere generation CC 8.0")
+KOKKOS_ARCH_OPTION(ZEN             HOST "AMD Zen architecture")
+KOKKOS_ARCH_OPTION(ZEN2            HOST "AMD Zen2 architecture")
 KOKKOS_ARCH_OPTION(VEGA900         GPU  "AMD GPU MI25 GFX900")
 KOKKOS_ARCH_OPTION(VEGA906         GPU  "AMD GPU MI50/MI60 GFX906")
+KOKKOS_ARCH_OPTION(INTEL_GEN       GPU  "Intel GPUs Gen9+")

 IF (KOKKOS_ENABLE_CUDA)
 #Regardless of version, make sure we define the general architecture name
@ -75,6 +85,10 @@ IF (KOKKOS_ENABLE_CUDA)
  IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72)
    SET(KOKKOS_ARCH_VOLTA ON)
  ENDIF()
+
+  IF (KOKKOS_ARCH_AMPERE80)
+    SET(KOKKOS_ARCH_AMPERE ON)
+  ENDIF()
 ENDIF()


@ -88,9 +102,10 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
    ${COMMON_WARNINGS})

  COMPILER_SPECIFIC_FLAGS(
-    PGI NO-VALUE-SPECIFIED
-    GNU     ${GNU_WARNINGS}
-    DEFAULT ${COMMON_WARNINGS}
+    COMPILER_ID CMAKE_CXX_COMPILER_ID
+    PGI         NO-VALUE-SPECIFIED
+    GNU         ${GNU_WARNINGS}
+    DEFAULT     ${COMMON_WARNINGS}
  )
 ENDIF()

@ -102,6 +117,9 @@ GLOBAL_SET(KOKKOS_CUDA_OPTIONS)
 IF (KOKKOS_ENABLE_CUDA_LAMBDA)
  IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
    GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-extended-lambda")
+    IF(KOKKOS_COMPILER_CUDA_VERSION GREATER_EQUAL 110)
+      GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this")
+    ENDIF()
  ENDIF()
 ENDIF()

@ -113,7 +131,6 @@ ENDIF()

 IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
  SET(CUDA_ARCH_FLAG "--cuda-gpu-arch")
-  SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
  GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda)
  IF (KOKKOS_ENABLE_CUDA)
     SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE)
@ -133,6 +150,15 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
  ENDIF()
 ENDIF()

+
+#------------------------------- KOKKOS_HIP_OPTIONS ---------------------------
+#clear anything that might be in the cache
+GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP)
+  SET(AMDGPU_ARCH_FLAG "--amdgpu-target")
+ENDIF()
+
+
 IF (KOKKOS_ARCH_ARMV80)
  COMPILER_SPECIFIC_FLAGS(
    Cray NO-VALUE-SPECIFIED
@ -167,12 +193,21 @@ IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
  )
 ENDIF()

-IF (KOKKOS_ARCH_EPYC)
+IF (KOKKOS_ARCH_ZEN)
  COMPILER_SPECIFIC_FLAGS(
    Intel   -mavx2
    DEFAULT -march=znver1 -mtune=znver1
  )
-  SET(KOKKOS_ARCH_AMD_EPYC ON)
+  SET(KOKKOS_ARCH_AMD_ZEN  ON)
+  SET(KOKKOS_ARCH_AMD_AVX2 ON)
+ENDIF()
+
+IF (KOKKOS_ARCH_ZEN2)
+  COMPILER_SPECIFIC_FLAGS(
+    Intel   -mavx2
+    DEFAULT -march=znver2 -mtune=znver2
+  )
+  SET(KOKKOS_ARCH_AMD_ZEN2 ON)
  SET(KOKKOS_ARCH_AMD_AVX2 ON)
 ENDIF()

@ -216,14 +251,6 @@ IF (KOKKOS_ARCH_BDW)
  )
 ENDIF()

-IF (KOKKOS_ARCH_EPYC)
-  SET(KOKKOS_ARCH_AMD_AVX2 ON)
-  COMPILER_SPECIFIC_FLAGS(
-    Intel   -mvax2
-    DEFAULT  -march=znver1 -mtune=znver1
-  )
-ENDIF()
-
 IF (KOKKOS_ARCH_KNL)
  #avx512-mic
  SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable
@ -253,7 +280,7 @@ IF (KOKKOS_ARCH_SKX)
  )
 ENDIF()

-IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_EPYC)
+IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2)
  SET(KOKKOS_USE_ISA_X86_64 ON)
 ENDIF()

@ -296,6 +323,21 @@ IF (Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
  )
 ENDIF()

+# Clang needs mcx16 option enabled for Windows atomic functions
+IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32)
+  COMPILER_SPECIFIC_OPTIONS(
+    Clang -mcx16
+  )
+ENDIF()
+
+# MSVC ABI has many deprecation warnings, so ignore them
+IF (CMAKE_CXX_COMPILER_ID STREQUAL MSVC OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
+  COMPILER_SPECIFIC_DEFS(
+    Clang _CRT_SECURE_NO_WARNINGS
+  )
+ENDIF()
+
+
 #Right now we cannot get the compiler ID when cross-compiling, so just check
 #that HIP is enabled
 IF (Kokkos_ENABLE_HIP)
@ -324,11 +366,15 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG)
    ELSE()
      SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE)
      GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
-      IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE)
+      IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
        GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}")
      ENDIF()
    ENDIF()
  ENDIF()
+  LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG})
+  SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE)
+  LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH})
+  SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE)
 ENDFUNCTION()


@ -346,6 +392,7 @@ CHECK_CUDA_ARCH(PASCAL61  sm_61)
 CHECK_CUDA_ARCH(VOLTA70   sm_70)
 CHECK_CUDA_ARCH(VOLTA72   sm_72)
 CHECK_CUDA_ARCH(TURING75  sm_75)
+CHECK_CUDA_ARCH(AMPERE80  sm_80)

 SET(AMDGPU_ARCH_ALREADY_SPECIFIED "")
 FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG)
@ -372,12 +419,19 @@ ENDFUNCTION()
 CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25
 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60

+IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) # report a configure error when HIP is on but no AMD GPU arch is set
+  MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
+                     "Please enable one AMD GPU architecture via '-DKokkos_ARCH_{..}=ON'.")
+ENDIF()
+
 IF (KOKKOS_ENABLE_OPENMPTARGET)
  SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
  IF (CLANG_CUDA_ARCH)
+    STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH})
    COMPILER_SPECIFIC_FLAGS(
      Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
      XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
+      PGI -gpu=${PGI_CUDA_ARCH}
    )
  ENDIF()
  SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
@ -386,10 +440,39 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
      Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
    )
  ENDIF()
+  IF (KOKKOS_ARCH_INTEL_GEN)
+    COMPILER_SPECIFIC_FLAGS(
+      IntelClang -fopenmp-targets=spir64 -D__STRICT_ANSI__
+    )
+  ENDIF()
 ENDIF()

 IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED)
-  MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled.  Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.")
+  # Try to autodetect the CUDA Compute Capability by asking the device
+  SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir)
+  FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR})
+  FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR})
+
+  TRY_RUN(
+    _RESULT
+    _COMPILE_RESULT
+    ${_BINARY_TEST_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc
+    COMPILE_DEFINITIONS -DSM_ONLY
+    RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY)
+  LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX)
+  IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1)
+    MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}")
+    LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE)
+    KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON)
+    CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY})
+    LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE})
+  ELSE()
+    MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. "
+                       "Please give one '-DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n"
+                       "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. "
+                       "If you are cross-compiling, you should try to do this on a compute node.")
+  ENDIF()
 ENDIF()

 #CMake verbose is kind of pointless
@ -453,4 +536,3 @@ MESSAGE(STATUS "Architectures:")
 FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST})
  MESSAGE(STATUS " ${Arch}")
 ENDFOREACH()
-
--- a/lib/kokkos/cmake/kokkos_compiler_id.cmake
+++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake
@ -4,33 +4,54 @@ SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER})
 SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
 SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION})

-# Check if the compiler is nvcc (which really means nvcc_wrapper).
-EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
-                COMMAND grep nvcc
-                COMMAND wc -l
-                OUTPUT_VARIABLE INTERNAL_HAVE_COMPILER_NVCC
-                OUTPUT_STRIP_TRAILING_WHITESPACE)
+IF(Kokkos_ENABLE_CUDA)
+  # Check if the compiler is nvcc (which really means nvcc_wrapper).
+  EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
+                  OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE "${INTERNAL_COMPILER_VERSION}") # quoted: stays a valid call when --version printed nothing
+
+  STRING(FIND "${INTERNAL_COMPILER_VERSION_ONE_LINE}" "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) # yields -1 when "nvcc" is absent


-STRING(REGEX REPLACE "^ +" ""
-       INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")
-
+  STRING(REGEX REPLACE "^ +" ""
+         INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")
+  IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1)
+    SET(INTERNAL_HAVE_COMPILER_NVCC true)
+  ELSE()
+    SET(INTERNAL_HAVE_COMPILER_NVCC false)
+  ENDIF()
+ENDIF()

 IF(INTERNAL_HAVE_COMPILER_NVCC)
+  # Save the host compiler id before overwriting it.
+  SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID})
+
  # SET the compiler id to nvcc.  We use the value used by CMake 3.8.
  SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE)

-  # SET nvcc's compiler version.
-  EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
-                  COMMAND grep release
-                  OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
-                  OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-  STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
-         TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION})
+  STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+"
+         TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE})
+  STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION)
  SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE)
+  MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}")
 ENDIF()

+IF(Kokkos_ENABLE_HIP)
+  # Ask the compiler for its version banner and derive the HIP compiler id/version from it
+  EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
+                  OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE "${INTERNAL_COMPILER_VERSION}") # quoted: valid even for empty output
+  SET(KOKKOS_CXX_COMPILER_ID HIP CACHE STRING INTERNAL FORCE)
+
+  STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+"
+         TEMP_CXX_COMPILER_VERSION "${INTERNAL_COMPILER_VERSION_ONE_LINE}")
+  SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE)
+  MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}")
+ENDIF()

 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
  # The Cray compiler reports as Clang to most versions of CMake
@ -42,6 +63,16 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
  IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang
    SET(KOKKOS_CLANG_IS_CRAY TRUE)
  ENDIF()
+  # The clang based Intel compiler reports as Clang to most versions of CMake
+  EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version
+                  COMMAND grep icpx
+                  COMMAND wc -l
+                  OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+  IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang
+    SET(KOKKOS_CLANG_IS_INTEL TRUE)
+    SET(KOKKOS_CXX_COMPILER_ID IntelClang CACHE STRING INTERNAL FORCE)
+  ENDIF()
 ENDIF()

 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY)
@ -65,6 +96,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Clang      3.5.2 or higher"
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    GCC        4.8.4 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    Intel     15.0.2 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    NVCC      9.0.69 or higher")
+SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    HIPCC      3.5.0 or higher")
 SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n    PGI         17.1 or higher\n")

 IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
@ -84,6 +116,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
    MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
  ENDIF()
  SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
+ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP)
+  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.5.0)
+    MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+  ENDIF()
 ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.1)
    MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
--- a/lib/kokkos/cmake/kokkos_corner_cases.cmake
+++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake
@ -1,4 +1,4 @@
-IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY)
+IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_CLANG_IS_CRAY AND NOT "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
  # The clang "version" doesn't actually tell you what runtimes and tools
  # were built into Clang. We should therefore make sure that libomp
  # was actually built into Clang. Otherwise the user will get nonsensical
@ -49,11 +49,11 @@ ENDIF()

 IF (KOKKOS_CXX_STANDARD STREQUAL 17)
  IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7)
-    MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need 17 support")
+    MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.")
  ENDIF()

-  IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
-    MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC. Please reduce the C++ standard to 14. No versions of NVCC currently support 17.")
+  IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11)
+    MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.")
  ENDIF()
 ENDIF()

--- a/lib/kokkos/cmake/kokkos_enable_devices.cmake
+++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake
@ -36,25 +36,51 @@ IF(KOKKOS_ENABLE_OPENMP)
  IF(KOKKOS_CLANG_IS_CRAY)
    SET(ClangOpenMPFlag -fopenmp)
  ENDIF()
-  COMPILER_SPECIFIC_FLAGS(
-    Clang      ${ClangOpenMPFlag}
-    AppleClang -Xpreprocessor -fopenmp
-    PGI        -mp
-    NVIDIA     -Xcompiler -fopenmp
-    Cray       NO-VALUE-SPECIFIED
-    XL         -qsmp=omp
-    DEFAULT    -fopenmp
-  )
-  COMPILER_SPECIFIC_LIBS(
-    AppleClang -lomp
-  )
+  IF(KOKKOS_CLANG_IS_INTEL)
+    SET(ClangOpenMPFlag -fiopenmp)
+  ENDIF()
+  IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
+    #expression /openmp yields error, so add a specific Clang flag
+    COMPILER_SPECIFIC_OPTIONS(Clang /clang:-fopenmp)
+    #link omp library from LLVM lib dir
+    get_filename_component(LLVM_BIN_DIR ${CMAKE_CXX_COMPILER_AR} DIRECTORY)
+    COMPILER_SPECIFIC_LIBS(Clang "${LLVM_BIN_DIR}/../lib/libomp.lib")
+  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
+    COMPILER_SPECIFIC_FLAGS(
+      COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
+      Clang      -Xcompiler ${ClangOpenMPFlag}
+      PGI        -Xcompiler -mp
+      Cray       NO-VALUE-SPECIFIED
+      XL         -Xcompiler -qsmp=omp
+      DEFAULT    -Xcompiler -fopenmp
+    )
+  ELSE()
+    COMPILER_SPECIFIC_FLAGS(
+      Clang      ${ClangOpenMPFlag}
+      AppleClang -Xpreprocessor -fopenmp
+      PGI        -mp
+      Cray       NO-VALUE-SPECIFIED
+      XL         -qsmp=omp
+      DEFAULT    -fopenmp
+    )
+    COMPILER_SPECIFIC_LIBS(
+      AppleClang -lomp
+    )
+  ENDIF()
 ENDIF()

 KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend")
 IF (KOKKOS_ENABLE_OPENMPTARGET)
+SET(ClangOpenMPFlag -fopenmp=libomp)
+  IF(KOKKOS_CLANG_IS_CRAY)
+    SET(ClangOpenMPFlag -fopenmp)
+  ENDIF()
+
  COMPILER_SPECIFIC_FLAGS(
-    Clang      -fopenmp -fopenmp=libomp
+    Clang      ${ClangOpenMPFlag} -Wno-openmp-mapping
+    IntelClang -fiopenmp -Wno-openmp-mapping
    XL         -qsmp=omp -qoffload -qnoeh
+    PGI        -mp=gpu
    DEFAULT    -fopenmp
  )
  COMPILER_SPECIFIC_DEFS(
@ -65,6 +91,9 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
 #  COMPILER_SPECIFIC_LIBS(
 #    Clang -lopenmptarget
 #  )
+   IF(KOKKOS_CXX_STANDARD LESS 17)
+     MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer")
+   ENDIF()
 ENDIF()

 IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA)
@ -76,6 +105,9 @@ KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend"

 IF (KOKKOS_ENABLE_CUDA)
  GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled")
+  IF(WIN32)
+    GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS -x cu)
+  ENDIF()
 ENDIF()

 # We want this to default to OFF for cache reasons, but if no
--- a/lib/kokkos/cmake/kokkos_enable_options.cmake
+++ b/lib/kokkos/cmake/kokkos_enable_options.cmake
@ -45,10 +45,9 @@ UNSET(_UPPERCASE_CMAKE_BUILD_TYPE)
 KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS      OFF "Whether to perform extra large memory tests")
 KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK   OFF "Whether to use bounds checking - will increase runtime")
 KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS    OFF "Whether to print all compiler warnings")
-KOKKOS_ENABLE_OPTION(PROFILING            ON  "Whether to create bindings for profiling tools")
 KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded")
+KOKKOS_ENABLE_OPTION(TUNING               OFF "Whether to create bindings for tuning tools")
 KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
-KOKKOS_ENABLE_OPTION(DEPRECATED_CODE          OFF "Whether to enable deprecated code")

 IF (KOKKOS_ENABLE_CUDA)
  SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
--- a/lib/kokkos/cmake/kokkos_functions.cmake
+++ b/lib/kokkos/cmake/kokkos_functions.cmake
@ -47,6 +47,13 @@ FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING)
  SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX})
  STRING(TOUPPER ${CAMEL_NAME} UC_NAME)

+  LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX})
+  SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE)
+  LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}")
+  SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE)
+  LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE})
+  SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE)
+
  # Make sure this appears in the cache with the appropriate DOCSTRING
  SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING})

@ -73,7 +80,21 @@ FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING)
  ELSE()
    SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE)
  ENDIF()
+ENDFUNCTION()

+FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE)
+  LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX)
+  IF(OPTION_INDEX EQUAL -1)
+    MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}")
+  ENDIF()
+  SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX})
+  STRING(TOUPPER ${CAMEL_NAME} UC_NAME)
+
+  LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING)
+  LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE)
+  SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE)
+  MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}")
+  SET(${UC_NAME} ${VALUE} PARENT_SCOPE)
 ENDFUNCTION()

 FUNCTION(kokkos_append_config_line LINE)
@ -109,8 +130,8 @@ ENDMACRO()

 MACRO(kokkos_export_imported_tpl NAME)
  IF (NOT KOKKOS_HAS_TRILINOS)
-    GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE)
-    IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY")
+    GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED)
+    IF (NOT LIB_IMPORTED)
      # This is not an imported target
      # This an interface library that we created
      INSTALL(
@ -123,12 +144,18 @@ MACRO(kokkos_export_imported_tpl NAME)
    ELSE()
      #make sure this also gets "exported" in the config file
      KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})")
-      KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)")
-      KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES")

-      GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION)
-      IF(TPL_LIBRARY)
-        KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION ${TPL_LIBRARY}")
+      GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE)
+      IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY")
+        KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)")
+        KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES")
+      ELSE()
+        KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)")
+        KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES")
+        GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION)
+        IF(TPL_LIBRARY)
+          KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION ${TPL_LIBRARY}")
+        ENDIF()
      ENDIF()

      GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES)
@ -737,18 +764,22 @@ FUNCTION(kokkos_link_tpl TARGET)
 ENDFUNCTION()

 FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
-  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang GNU)
+  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP)
  CMAKE_PARSE_ARGUMENTS(
    PARSE
    "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"
-    ""
+    "COMPILER_ID"
    "${COMPILERS}"
    ${ARGN})
  IF(PARSE_UNPARSED_ARGUMENTS)
    MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options")
  ENDIF()

-  SET(COMPILER ${KOKKOS_CXX_COMPILER_ID})
+  IF(PARSE_COMPILER_ID)
+    SET(COMPILER ${${PARSE_COMPILER_ID}})
+  ELSE()
+    SET(COMPILER ${KOKKOS_CXX_COMPILER_ID})
+  ENDIF()

  SET(COMPILER_SPECIFIC_FLAGS_TMP)
  FOREACH(COMP ${COMPILERS})
@ -792,6 +823,14 @@ FUNCTION(COMPILER_SPECIFIC_FLAGS)
  COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS)
 ENDFUNCTION(COMPILER_SPECIFIC_FLAGS)

+FUNCTION(COMPILER_SPECIFIC_OPTIONS)
+  COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS)
+ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS)
+
+FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS)
+  COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS)
+ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS)
+
 FUNCTION(COMPILER_SPECIFIC_DEFS)
  COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS)
 ENDFUNCTION(COMPILER_SPECIFIC_DEFS)
@ -799,3 +838,36 @@ ENDFUNCTION(COMPILER_SPECIFIC_DEFS)
 FUNCTION(COMPILER_SPECIFIC_LIBS)
  COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES)
 ENDFUNCTION(COMPILER_SPECIFIC_LIBS)
+
+# Given a list of the form
+#  key1;value1;key2;value2,...
+# Create a list of all keys in a variable named ${KEY_LIST_NAME}
+# and set the value for each key in a variable ${VAR_PREFIX}key1,...
+# kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2)
+# would produce a list variable ALL_ARCHES=key1;key2
+# and individual variables ARCHkey1=value1 and ARCHkey2=value2
+MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME)
+  SET(PARSE_KEY ON)
+  SET(${KEY_LIST_NAME})
+  FOREACH(ENTRY ${ARGN})
+    IF(PARSE_KEY)
+      SET(CURRENT_KEY ${ENTRY})
+      SET(PARSE_KEY OFF)
+      LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY})
+    ELSE()
+      SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY})
+      SET(PARSE_KEY ON)
+    ENDIF()
+  ENDFOREACH()
+ENDMACRO()
+
+FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) # arguments: alternating <option suffix> "<replacement hint>" pairs
+  KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) # fills DEPRECATED_LIST and DEPRECATED_MSG_<suffix>
+  FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST})
+    SET(OPTION_NAME Kokkos_${OPTION_SUFFIX})
+    SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}})
+    IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off
+      MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPTION_MESSAGE}")
+    ENDIF()
+  ENDFOREACH()
+ENDFUNCTION()
--- a/lib/kokkos/cmake/kokkos_install.cmake
+++ b/lib/kokkos/cmake/kokkos_install.cmake
@ -1,5 +1,5 @@
 INCLUDE(CMakePackageConfigHelpers)
-IF (NOT KOKKOS_HAS_TRILINOS)
+IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
  INCLUDE(GNUInstallDirs)

  #Set all the variables needed for KokkosConfig.cmake
--- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake
+++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake
@ -28,19 +28,30 @@ FUNCTION(kokkos_set_cxx_standard_feature standard)
    GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME})
  ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME})
    MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature")
+    IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang))
+      SET(SUPPORTED_NVCC_FLAGS "-std=c++11;-std=c++14;-std=c++17")
+      IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS)
+        MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.")
+      ENDIF()
+    ENDIF()
    GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME})
-  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
+  ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
    #MSVC doesn't need a command line flag, that doesn't mean it has no support
    MESSAGE(STATUS "Using no flag for C++${standard} standard as feature")
    GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME})
+  ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)
+    MESSAGE(STATUS "Using no flag for C++${standard} standard as feature")
+    GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "")
  ELSE()
    #nope, we can't do anything here
    MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferrably including your CMake command.")
    GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "")
  ENDIF()

-  IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES)
-    MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported")
+  IF(NOT WIN32)
+    IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES)
+     MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported")
+    ENDIF()
  ENDIF()
 ENDFUNCTION()

@ -123,7 +134,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE)
  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel)
    INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake)
    kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
-  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
+  ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32))
    INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake)
    kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
  ELSE()
--- a/lib/kokkos/cmake/kokkos_tpls.cmake
+++ b/lib/kokkos/cmake/kokkos_tpls.cmake
@ -13,10 +13,10 @@ KOKKOS_TPL_OPTION(LIBNUMA Off)
 KOKKOS_TPL_OPTION(MEMKIND Off)
 KOKKOS_TPL_OPTION(CUDA    Off)
 KOKKOS_TPL_OPTION(LIBRT   Off)
-KOKKOS_TPL_OPTION(LIBDL   On)
-
-IF(KOKKOS_ENABLE_PROFILING AND NOT KOKKOS_ENABLE_LIBDL)
-  MESSAGE(SEND_ERROR "Kokkos_ENABLE_PROFILING requires Kokkos_ENABLE_LIBDL=ON")
+IF (WIN32)
+  KOKKOS_TPL_OPTION(LIBDL Off)
+ELSE()
+  KOKKOS_TPL_OPTION(LIBDL On)
 ENDIF()

 IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX)
--- a/lib/kokkos/cmake/kokkos_tribits.cmake
+++ b/lib/kokkos/cmake/kokkos_tribits.cmake
@ -21,10 +21,6 @@ IF (KOKKOS_HAS_TRILINOS)
    SET(${PROJECT_NAME}_ENABLE_DEBUG OFF)
  ENDIF()

-  IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_CXX11)
-    SET(${PROJECT_NAME}_ENABLE_CXX11 ON)
-  ENDIF()
-
  IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS)
    SET(${PROJECT_NAME}_ENABLE_TESTS OFF)
  ENDIF()
@ -134,7 +130,7 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME)
    VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS})
    #All executables must link to all the kokkos targets
    #This is just private linkage because exe is final
-    TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE kokkos)
+    TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos)
  endif()
 ENDFUNCTION()

@ -174,16 +170,42 @@ FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME)
 ENDFUNCTION()

 MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT)
- INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake)
- INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake)
- INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake)
- INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake)
- INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake)
- IF (NOT KOKKOS_HAS_TRILINOS)
-  SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
-  INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake)
- ENDIF()
- INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake)
+  # This is needed for both regular build and install tests
+  INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake)
+  #set an internal option, if not already set
+  SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation")
+  IF (Kokkos_INSTALL_TESTING) # install-test mode: use an installed Kokkos instead of configuring one here
+    SET(KOKKOS_ENABLE_TESTS ON)
+    SET(KOKKOS_ENABLE_EXAMPLES ON)
+    # This looks a little weird, but what we are doing
+    # is to NOT build Kokkos but instead look for an
+    # installed Kokkos - then build examples and tests
+    # against that installed Kokkos
+    FIND_PACKAGE(Kokkos REQUIRED)
+    # Just grab the configuration from the installation
+    FOREACH(DEV ${Kokkos_DEVICES}) # Kokkos_DEVICES/OPTIONS/TPLS/ARCH are presumably exported by the found package - TODO confirm
+      SET(KOKKOS_ENABLE_${DEV} ON)
+    ENDFOREACH()
+    FOREACH(OPT ${Kokkos_OPTIONS})
+      SET(KOKKOS_ENABLE_${OPT} ON)
+    ENDFOREACH()
+    FOREACH(TPL ${Kokkos_TPLS})
+      SET(KOKKOS_ENABLE_${TPL} ON)
+    ENDFOREACH()
+    FOREACH(ARCH ${Kokkos_ARCH})
+      SET(KOKKOS_ARCH_${ARCH} ON)
+    ENDFOREACH()
+  ELSE() # regular build: run the remaining Kokkos configuration modules
+    INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake)
+    INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake)
+    INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake)
+    INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake)
+    IF (NOT KOKKOS_HAS_TRILINOS)
+      SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
+      INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake)
+    ENDIF()
+    INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake)
+  ENDIF()
 ENDMACRO()

 MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME)
@ -310,28 +332,40 @@ FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME)
    LIST(REMOVE_DUPLICATES PARSE_SOURCES)
  ENDIF()

+  IF(PARSE_STATIC)
+    SET(LINK_TYPE STATIC)
+  ENDIF()
+
+  IF(PARSE_SHARED)
+    SET(LINK_TYPE SHARED)
+  ENDIF()
+
+  # MSVC and other platforms want to have
+  # the headers included as source files
+  # for better dependency detection
  ADD_LIBRARY(
    ${LIBRARY_NAME}
+    ${LINK_TYPE}
    ${PARSE_HEADERS}
    ${PARSE_SOURCES}
  )

  KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME})

-  INSTALL(
-    FILES  ${PARSE_HEADERS}
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-    COMPONENT ${PACKAGE_NAME}
-  )
-
  #In case we are building in-tree, add an alias name
  #that matches the install Kokkos:: name
  ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME})
 ENDFUNCTION()

 FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME)
+  CMAKE_PARSE_ARGUMENTS(PARSE
+    "ADD_BUILD_OPTIONS"
+    ""
+    ""
+    ${ARGN}
+  )
  IF (KOKKOS_HAS_TRILINOS)
-    TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${ARGN})
+    TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS})
    #Stolen from Tribits - it can add prefixes
    SET(TRIBITS_LIBRARY_NAME_PREFIX "${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}")
    SET(TRIBITS_LIBRARY_NAME ${TRIBITS_LIBRARY_NAME_PREFIX}${LIBRARY_NAME})
@ -346,8 +380,10 @@ FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME)
    #KOKKOS_SET_LIBRARY_PROPERTIES(${TRIBITS_LIBRARY_NAME} PLAIN_STYLE)
  ELSE()
    KOKKOS_INTERNAL_ADD_LIBRARY(
-      ${LIBRARY_NAME} ${ARGN})
-    KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME})
+      ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS})
+    IF (PARSE_ADD_BUILD_OPTIONS)
+      KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME})
+    ENDIF()
  ENDIF()
 ENDFUNCTION()

@ -364,17 +400,6 @@ ELSE()

  ADD_LIBRARY(${NAME} INTERFACE)
  KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME})
-
-  INSTALL(
-    FILES  ${PARSE_HEADERS}
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-  )
-
-  INSTALL(
-    FILES  ${PARSE_HEADERS}
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-    COMPONENT ${PACKAGE_NAME}
-  )
 ENDIF()
 ENDFUNCTION()

--- a/lib/kokkos/config/yaml/volta.yaml
+++ b/lib/kokkos/config/yaml/volta.yaml
@ -0,0 +1,4 @@
+packages:
+ kokkos:
+  variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1
+  compiler: [gcc@7.2.0]
--- a/lib/kokkos/containers/CMakeLists.txt
+++ b/lib/kokkos/containers/CMakeLists.txt
@ -2,7 +2,9 @@

 KOKKOS_SUBPACKAGE(Containers)

-ADD_SUBDIRECTORY(src)
+IF (NOT Kokkos_INSTALL_TESTING)
+  ADD_SUBDIRECTORY(src)
+ENDIF()

 KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 KOKKOS_ADD_TEST_DIRECTORIES(performance_tests)
--- a/lib/kokkos/containers/performance_tests/Makefile
+++ b/lib/kokkos/containers/performance_tests/Makefile
@ -31,10 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	TEST_TARGETS += test-cuda
 endif

-ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
-	OBJ_ROCM = TestROCm.o TestMain.o gtest-all.o
-	TARGETS += KokkosContainers_PerformanceTest_ROCm
-	TEST_TARGETS += test-rocm
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+	OBJ_HIP = TestHIP.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_HIP
+	TEST_TARGETS += test-hip
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
--- a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
+++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
@ -58,7 +58,7 @@ namespace Performance {
 // View functor
 template <typename DeviceType>
 struct InitViewFunctor {
-  typedef Kokkos::View<double ***, DeviceType> inviewtype;
+  using inviewtype = Kokkos::View<double ***, DeviceType>;
  inviewtype _inview;

  InitViewFunctor(inviewtype &inview_) : _inview(inview_) {}
@ -73,10 +73,10 @@ struct InitViewFunctor {
  }

  struct SumComputationTest {
-    typedef Kokkos::View<double ***, DeviceType> inviewtype;
+    using inviewtype = Kokkos::View<double ***, DeviceType>;
    inviewtype _inview;

-    typedef Kokkos::View<double *, DeviceType> outviewtype;
+    using outviewtype = Kokkos::View<double *, DeviceType>;
    outviewtype _outview;

    KOKKOS_INLINE_FUNCTION
@ -96,7 +96,7 @@ struct InitViewFunctor {

 template <typename DeviceType>
 struct InitStrideViewFunctor {
-  typedef Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType> inviewtype;
+  using inviewtype = Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType>;
  inviewtype _inview;

  InitStrideViewFunctor(inviewtype &inview_) : _inview(inview_) {}
@ -113,7 +113,7 @@ struct InitStrideViewFunctor {

 template <typename DeviceType>
 struct InitViewRank7Functor {
-  typedef Kokkos::View<double *******, DeviceType> inviewtype;
+  using inviewtype = Kokkos::View<double *******, DeviceType>;
  inviewtype _inview;

  InitViewRank7Functor(inviewtype &inview_) : _inview(inview_) {}
@ -131,7 +131,7 @@ struct InitViewRank7Functor {
 // DynRankView functor
 template <typename DeviceType>
 struct InitDynRankViewFunctor {
-  typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+  using inviewtype = Kokkos::DynRankView<double, DeviceType>;
  inviewtype _inview;

  InitDynRankViewFunctor(inviewtype &inview_) : _inview(inview_) {}
@ -146,10 +146,10 @@ struct InitDynRankViewFunctor {
  }

  struct SumComputationTest {
-    typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+    using inviewtype = Kokkos::DynRankView<double, DeviceType>;
    inviewtype _inview;

-    typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
+    using outviewtype = Kokkos::DynRankView<double, DeviceType>;
    outviewtype _outview;

    KOKKOS_INLINE_FUNCTION
@ -169,8 +169,8 @@ struct InitDynRankViewFunctor {

 template <typename DeviceType>
 void test_dynrankview_op_perf(const int par_size) {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;
  const size_type dim_2 = 90;
  const size_type dim_3 = 30;

@ -184,7 +184,7 @@ void test_dynrankview_op_perf(const int par_size) {
  {
    Kokkos::View<double ***, DeviceType> testview("testview", par_size, dim_2,
                                                  dim_3);
-    typedef InitViewFunctor<DeviceType> FunctorType;
+    using FunctorType = InitViewFunctor<DeviceType>;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0, par_size);
@ -204,7 +204,7 @@ void test_dynrankview_op_perf(const int par_size) {

    Kokkos::View<double ***, Kokkos::LayoutStride, DeviceType> teststrideview =
        Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL);
-    typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
+    using FunctorStrideType = InitStrideViewFunctor<DeviceType>;

    timer.reset();
    Kokkos::parallel_for(policy, FunctorStrideType(teststrideview));
@ -216,7 +216,7 @@ void test_dynrankview_op_perf(const int par_size) {
  {
    Kokkos::View<double *******, DeviceType> testview("testview", par_size,
                                                      dim_2, dim_3, 1, 1, 1, 1);
-    typedef InitViewRank7Functor<DeviceType> FunctorType;
+    using FunctorType = InitViewRank7Functor<DeviceType>;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0, par_size);
@ -229,7 +229,7 @@ void test_dynrankview_op_perf(const int par_size) {
  {
    Kokkos::DynRankView<double, DeviceType> testdrview("testdrview", par_size,
                                                       dim_2, dim_3);
-    typedef InitDynRankViewFunctor<DeviceType> FunctorType;
+    using FunctorType = InitDynRankViewFunctor<DeviceType>;

    timer.reset();
    Kokkos::RangePolicy<DeviceType> policy(0, par_size);
--- a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
+++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@ -65,9 +65,9 @@ union helper {

 template <typename Device>
 struct generate_ids {
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef Kokkos::View<uint32_t*, execution_space> local_id_view;
+  using execution_space = Device;
+  using size_type       = typename execution_space::size_type;
+  using local_id_view   = Kokkos::View<uint32_t*, execution_space>;

  local_id_view local_2_global;

@ -96,13 +96,12 @@ struct generate_ids {

 template <typename Device>
 struct fill_map {
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef Kokkos::View<const uint32_t*, execution_space,
-                       Kokkos::MemoryRandomAccess>
-      local_id_view;
-  typedef Kokkos::UnorderedMap<uint32_t, size_type, execution_space>
-      global_id_view;
+  using execution_space = Device;
+  using size_type       = typename execution_space::size_type;
+  using local_id_view   = Kokkos::View<const uint32_t*, execution_space,
+                                     Kokkos::MemoryRandomAccess>;
+  using global_id_view =
+      Kokkos::UnorderedMap<uint32_t, size_type, execution_space>;

  global_id_view global_2_local;
  local_id_view local_2_global;
@ -120,18 +119,17 @@ struct fill_map {

 template <typename Device>
 struct find_test {
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef Kokkos::View<const uint32_t*, execution_space,
-                       Kokkos::MemoryRandomAccess>
-      local_id_view;
-  typedef Kokkos::UnorderedMap<const uint32_t, const size_type, execution_space>
-      global_id_view;
+  using execution_space = Device;
+  using size_type       = typename execution_space::size_type;
+  using local_id_view   = Kokkos::View<const uint32_t*, execution_space,
+                                     Kokkos::MemoryRandomAccess>;
+  using global_id_view =
+      Kokkos::UnorderedMap<const uint32_t, const size_type, execution_space>;

  global_id_view global_2_local;
  local_id_view local_2_global;

-  typedef size_t value_type;
+  using value_type = size_t;

  find_test(global_id_view gIds, local_id_view lIds, value_type& num_errors)
      : global_2_local(gIds), local_2_global(lIds) {
@ -156,12 +154,12 @@ struct find_test {

 template <typename Device>
 void test_global_to_local_ids(unsigned num_ids) {
-  typedef Device execution_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = Device;
+  using size_type       = typename execution_space::size_type;

-  typedef Kokkos::View<uint32_t*, execution_space> local_id_view;
-  typedef Kokkos::UnorderedMap<uint32_t, size_type, execution_space>
-      global_id_view;
+  using local_id_view = Kokkos::View<uint32_t*, execution_space>;
+  using global_id_view =
+      Kokkos::UnorderedMap<uint32_t, size_type, execution_space>;

  // size
  std::cout << num_ids << ", ";
--- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp
+++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp
@ -50,14 +50,14 @@

 namespace Perf {

-template <typename ExecSpace, typename Layout, int duplication,
-          int contribution>
+template <typename ExecSpace, typename Layout, typename Duplication,
+          typename Contribution>
 void test_scatter_view(int m, int n) {
  Kokkos::View<double * [3], Layout, ExecSpace> original_view("original_view",
                                                              n);
  {
    auto scatter_view = Kokkos::Experimental::create_scatter_view<
-        Kokkos::Experimental::ScatterSum, duplication, contribution>(
+        Kokkos::Experimental::ScatterSum, Duplication, Contribution>(
        original_view);
    Kokkos::Experimental::UniqueToken<
        ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global>
--- a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
+++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@ -55,9 +55,9 @@ namespace Perf {

 template <typename Device, bool Near>
 struct UnorderedMapTest {
-  typedef Device execution_space;
-  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space> map_type;
-  typedef typename map_type::histogram_type histogram_type;
+  using execution_space = Device;
+  using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space>;
+  using histogram_type = typename map_type::histogram_type;

  struct value_type {
    uint32_t failed_count;
--- a/lib/kokkos/containers/src/CMakeLists.txt
+++ b/lib/kokkos/containers/src/CMakeLists.txt
@ -9,6 +9,10 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

 SET(KOKKOS_CONTAINERS_SRCS)
 APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp)
+SET(KOKKOS_CONTAINER_HEADERS) # NOTE(review): cleared here but nothing visible in this hunk appends to it - confirm it is meant to stay empty
+APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) # headers are globbed into the SRCS list, not the HEADERS list
+APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
+

 INSTALL (
  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
@ -19,6 +23,7 @@ INSTALL (
 KOKKOS_ADD_LIBRARY(
  kokkoscontainers
  SOURCES ${KOKKOS_CONTAINERS_SRCS}
+  HEADERS ${KOKKOS_CONTAINER_HEADERS}
 )

 SET_TARGET_PROPERTIES(kokkoscontainers PROPERTIES VERSION ${Kokkos_VERSION})
--- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp
+++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
@ -73,8 +73,8 @@ void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src);
 template <typename Device>
 class Bitset {
 public:
-  typedef Device execution_space;
-  typedef unsigned size_type;
+  using execution_space = Device;
+  using size_type       = unsigned;

  enum { BIT_SCAN_REVERSE = 1u };
  enum { MOVE_HINT_BACKWARD = 2u };
@ -137,9 +137,9 @@ class Bitset {

    if (m_last_block_mask) {
      // clear the unused bits in the last block
-      typedef Kokkos::Impl::DeepCopy<typename execution_space::memory_space,
-                                     Kokkos::HostSpace>
-          raw_deep_copy;
+      using raw_deep_copy =
+          Kokkos::Impl::DeepCopy<typename execution_space::memory_space,
+                                 Kokkos::HostSpace>;
      raw_deep_copy(m_blocks.data() + (m_blocks.extent(0) - 1u),
                    &m_last_block_mask, sizeof(unsigned));
    }
@ -234,6 +234,10 @@ class Bitset {
    return find_any_helper(block_idx, offset, block, scan_direction);
  }

+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return m_blocks.is_allocated();  // forwards the query to the m_blocks member
+  }
+
 private:
  KOKKOS_FORCEINLINE_FUNCTION
  Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx,
@ -304,8 +308,8 @@ class Bitset {
 template <typename Device>
 class ConstBitset {
 public:
-  typedef Device execution_space;
-  typedef unsigned size_type;
+  using execution_space = Device;
+  using size_type       = unsigned;

 private:
  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
@ -380,9 +384,9 @@ void deep_copy(Bitset<DstDevice>& dst, Bitset<SrcDevice> const& src) {
        "Error: Cannot deep_copy bitsets of different sizes!");
  }

-  typedef Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
-                                 typename SrcDevice::memory_space>
-      raw_deep_copy;
+  using raw_deep_copy =
+      Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
+                             typename SrcDevice::memory_space>;
  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
                sizeof(unsigned) * src.m_blocks.extent(0));
 }
@ -394,9 +398,9 @@ void deep_copy(Bitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
        "Error: Cannot deep_copy bitsets of different sizes!");
  }

-  typedef Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
-                                 typename SrcDevice::memory_space>
-      raw_deep_copy;
+  using raw_deep_copy =
+      Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
+                             typename SrcDevice::memory_space>;
  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
                sizeof(unsigned) * src.m_blocks.extent(0));
 }
@ -408,9 +412,9 @@ void deep_copy(ConstBitset<DstDevice>& dst, ConstBitset<SrcDevice> const& src) {
        "Error: Cannot deep_copy bitsets of different sizes!");
  }

-  typedef Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
-                                 typename SrcDevice::memory_space>
-      raw_deep_copy;
+  using raw_deep_copy =
+      Kokkos::Impl::DeepCopy<typename DstDevice::memory_space,
+                             typename SrcDevice::memory_space>;
  raw_deep_copy(dst.m_blocks.data(), src.m_blocks.data(),
                sizeof(unsigned) * src.m_blocks.extent(0));
 }
--- a/lib/kokkos/containers/src/Kokkos_DualView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp
@ -100,99 +100,91 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
 public:
  //! \name Typedefs for device types and various Kokkos::View specializations.
  //@{
-  typedef ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> traits;
+  using traits = ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type>;

  //! The Kokkos Host Device type;
-  typedef typename traits::host_mirror_space host_mirror_space;
+  using host_mirror_space = typename traits::host_mirror_space;

  //! The type of a Kokkos::View on the device.
-  typedef View<typename traits::data_type, Arg1Type, Arg2Type, Arg3Type> t_dev;
+  using t_dev = View<typename traits::data_type, Arg1Type, Arg2Type, Arg3Type>;

  /// \typedef t_host
  /// \brief The type of a Kokkos::View host mirror of \c t_dev.
-  typedef typename t_dev::HostMirror t_host;
+  using t_host = typename t_dev::HostMirror;

  //! The type of a const View on the device.
  //! The type of a Kokkos::View on the device.
-  typedef View<typename traits::const_data_type, Arg1Type, Arg2Type, Arg3Type>
-      t_dev_const;
+  using t_dev_const =
+      View<typename traits::const_data_type, Arg1Type, Arg2Type, Arg3Type>;

  /// \typedef t_host_const
  /// \brief The type of a const View host mirror of \c t_dev_const.
-  typedef typename t_dev_const::HostMirror t_host_const;
+  using t_host_const = typename t_dev_const::HostMirror;

  //! The type of a const, random-access View on the device.
-  typedef View<typename traits::const_data_type, typename traits::array_layout,
-               typename traits::device_type,
-               Kokkos::MemoryTraits<Kokkos::RandomAccess> >
-      t_dev_const_randomread;
+  using t_dev_const_randomread =
+      View<typename traits::const_data_type, typename traits::array_layout,
+           typename traits::device_type,
+           Kokkos::MemoryTraits<Kokkos::RandomAccess> >;

  /// \typedef t_host_const_randomread
  /// \brief The type of a const, random-access View host mirror of
  ///   \c t_dev_const_randomread.
-  typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread;
+  using t_host_const_randomread = typename t_dev_const_randomread::HostMirror;

  //! The type of an unmanaged View on the device.
-  typedef View<typename traits::data_type, typename traits::array_layout,
-               typename traits::device_type, MemoryUnmanaged>
-      t_dev_um;
+  using t_dev_um =
+      View<typename traits::data_type, typename traits::array_layout,
+           typename traits::device_type, MemoryUnmanaged>;

  //! The type of an unmanaged View host mirror of \c t_dev_um.
-  typedef View<typename t_host::data_type, typename t_host::array_layout,
-               typename t_host::device_type, MemoryUnmanaged>
-      t_host_um;
+  using t_host_um =
+      View<typename t_host::data_type, typename t_host::array_layout,
+           typename t_host::device_type, MemoryUnmanaged>;

  //! The type of a const unmanaged View on the device.
-  typedef View<typename traits::const_data_type, typename traits::array_layout,
-               typename traits::device_type, MemoryUnmanaged>
-      t_dev_const_um;
+  using t_dev_const_um =
+      View<typename traits::const_data_type, typename traits::array_layout,
+           typename traits::device_type, MemoryUnmanaged>;

  //! The type of a const unmanaged View host mirror of \c t_dev_const_um.
-  typedef View<typename t_host::const_data_type, typename t_host::array_layout,
-               typename t_host::device_type, MemoryUnmanaged>
-      t_host_const_um;
+  using t_host_const_um =
+      View<typename t_host::const_data_type, typename t_host::array_layout,
+           typename t_host::device_type, MemoryUnmanaged>;

  //! The type of a const, random-access View on the device.
-  typedef View<typename t_host::const_data_type, typename t_host::array_layout,
-               typename t_host::device_type,
-               Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess> >
-      t_dev_const_randomread_um;
+  using t_dev_const_randomread_um =
+      View<typename t_host::const_data_type, typename t_host::array_layout,
+           typename t_host::device_type,
+           Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess> >;

  /// \typedef t_host_const_randomread
  /// \brief The type of a const, random-access View host mirror of
  ///   \c t_dev_const_randomread.
-  typedef
-      typename t_dev_const_randomread::HostMirror t_host_const_randomread_um;
-
-  //@}
-  //! \name The two View instances.
-  //@{
-
-  t_dev d_view;
-  t_host h_view;
+  using t_host_const_randomread_um =
+      typename t_dev_const_randomread::HostMirror;

  //@}
  //! \name Counters to keep track of changes ("modified" flags)
  //@{

-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
 protected:
  // modified_flags[0] -> host
  // modified_flags[1] -> device
-  typedef View<unsigned int[2], LayoutLeft, Kokkos::HostSpace> t_modified_flags;
+  using t_modified_flags = View<unsigned int[2], LayoutLeft, Kokkos::HostSpace>;
  t_modified_flags modified_flags;

 public:
-#else
-  typedef View<unsigned int[2], LayoutLeft, typename t_host::execution_space>
-      t_modified_flags;
-  typedef View<unsigned int, LayoutLeft, typename t_host::execution_space>
-      t_modified_flag;
-  t_modified_flags modified_flags;
-  t_modified_flag modified_host, modified_device;
-#endif
-
  //@}
+
+  // Moved this specifically after modified_flags to resolve an alignment issue
+  // on MSVC/NVCC
+  //! \name The two View instances.
+  //@{
+  t_dev d_view;
+  t_host h_view;
+  //@}
+
  //! \name Constructors
  //@{

@ -201,14 +193,7 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  /// Both device and host View objects are constructed using their
  /// default constructors.  The "modified" flags are both initialized
  /// to "unmodified."
-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
  DualView() = default;
-#else
-  DualView() : modified_flags(t_modified_flags("DualView::modified_flags")) {
-    modified_host   = t_modified_flag(modified_flags, 0);
-    modified_device = t_modified_flag(modified_flags, 1);
-  }
-#endif

  /// \brief Constructor that allocates View objects on both host and device.
  ///
@ -228,15 +213,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
           const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
           const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
           const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
-      : d_view(label, n0, n1, n2, n3, n4, n5, n6, n7),
+      : modified_flags(t_modified_flags("DualView::modified_flags")),
+        d_view(label, n0, n1, n2, n3, n4, n5, n6, n7),
        h_view(create_mirror_view(d_view))  // without UVM, host View mirrors
-        ,
-        modified_flags(t_modified_flags("DualView::modified_flags")) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    modified_host   = t_modified_flag(modified_flags, 0);
-    modified_device = t_modified_flag(modified_flags, 1);
-#endif
-  }
+  {}

  /// \brief Constructor that allocates View objects on both host and device.
  ///
@ -260,15 +240,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
           const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
           const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
           const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
-      : d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7),
+      : modified_flags(t_modified_flags("DualView::modified_flags")),
+        d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7),
        h_view(create_mirror_view(d_view))  // without UVM, host View mirrors
-        ,
-        modified_flags(t_modified_flags("DualView::modified_flags")) {
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    modified_host   = t_modified_flag(modified_flags, 0);
-    modified_device = t_modified_flag(modified_flags, 1);
-#endif
-  }
+  {}

  explicit inline DualView(const ViewAllocateWithoutInitializing& arg_prop,
                           const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@ -288,30 +263,16 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  //! Copy constructor (shallow copy)
  template <class SS, class LS, class DS, class MS>
  DualView(const DualView<SS, LS, DS, MS>& src)
-      : d_view(src.d_view),
-        h_view(src.h_view),
-        modified_flags(src.modified_flags)
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-        ,
-        modified_host(src.modified_host),
-        modified_device(src.modified_device)
-#endif
-  {
-  }
+      : modified_flags(src.modified_flags),
+        d_view(src.d_view),
+        h_view(src.h_view) {}

  //! Subview constructor
  template <class SD, class S1, class S2, class S3, class Arg0, class... Args>
  DualView(const DualView<SD, S1, S2, S3>& src, const Arg0& arg0, Args... args)
-      : d_view(Kokkos::subview(src.d_view, arg0, args...)),
-        h_view(Kokkos::subview(src.h_view, arg0, args...)),
-        modified_flags(src.modified_flags)
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-        ,
-        modified_host(src.modified_host),
-        modified_device(src.modified_device)
-#endif
-  {
-  }
+      : modified_flags(src.modified_flags),
+        d_view(Kokkos::subview(src.d_view, arg0, args...)),
+        h_view(Kokkos::subview(src.h_view, arg0, args...)) {}

  /// \brief Create DualView from existing device and host View objects.
  ///
@ -324,9 +285,9 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  /// \param d_view_ Device View
  /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror)
  DualView(const t_dev& d_view_, const t_host& h_view_)
-      : d_view(d_view_),
-        h_view(h_view_),
-        modified_flags(t_modified_flags("DualView::modified_flags")) {
+      : modified_flags(t_modified_flags("DualView::modified_flags")),
+        d_view(d_view_),
+        h_view(h_view_) {
    if (int(d_view.rank) != int(h_view.rank) ||
        d_view.extent(0) != h_view.extent(0) ||
        d_view.extent(1) != h_view.extent(1) ||
@ -348,10 +309,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
      Kokkos::Impl::throw_runtime_exception(
          "DualView constructed with incompatible views");
    }
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-    modified_host   = t_modified_flag(modified_flags, 0);
-    modified_device = t_modified_flag(modified_flags, 1);
-#endif
  }

  //@}
@ -367,20 +324,25 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  ///
  /// For example, suppose you create a DualView on Cuda, like this:
  /// \code
-  /// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda>
-  /// dual_view_type; dual_view_type DV ("my dual view", 100); \endcode If you
-  /// want to get the CUDA device View, do this: \code typename
-  /// dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> (); \endcode and if
-  /// you want to get the host mirror of that View, do this: \code typedef
-  /// typename Kokkos::HostSpace::execution_space host_device_type; typename
-  /// dual_view_type::t_host hostView = DV.view<host_device_type> (); \endcode
+  ///   using dual_view_type =
+  ///       Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda>;
+  ///   dual_view_type DV ("my dual view", 100);
+  /// \endcode
+  /// If you want to get the CUDA device View, do this:
+  /// \code
+  ///   typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
+  /// \endcode
+  /// and if you want to get the host mirror of that View, do this:
+  /// \code
+  ///   using host_device_type = typename Kokkos::HostSpace::execution_space;
+  ///   typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
+  /// \endcode
  template <class Device>
  KOKKOS_INLINE_FUNCTION const typename Impl::if_c<
      std::is_same<typename t_dev::memory_space,
                   typename Device::memory_space>::value,
      t_dev, t_host>::type&
  view() const {
-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
    constexpr bool device_is_memspace =
        std::is_same<Device, typename Device::memory_space>::value;
    constexpr bool device_is_execspace =
@ -415,7 +377,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
              (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))),
        "Template parameter to .view() must exactly match one of the "
        "DualView's device types or one of the execution or memory spaces");
-#endif

    return Impl::if_c<std::is_same<typename t_dev::memory_space,
                                   typename Device::memory_space>::value,
@ -428,6 +389,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  KOKKOS_INLINE_FUNCTION
  t_dev view_device() const { return d_view; }

+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return (d_view.is_allocated() && h_view.is_allocated());
+  }
+
  template <class Device>
  static int get_device_side() {
    constexpr bool device_is_memspace =
@ -453,7 +418,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
        std::is_same<typename Device::memory_space,
                     typename t_host::device_type>::value;

-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
    static_assert(
        device_is_t_dev_device || device_is_t_host_device ||
            (device_is_memspace &&
@ -465,13 +429,8 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
              (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))),
        "Template parameter to .sync() must exactly match one of the "
        "DualView's device types or one of the execution or memory spaces");
-#endif

-#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
    int dev = -1;
-#else
-    int dev = 0;
-#endif
    if (device_is_t_dev_device)
      dev = 1;
    else if (device_is_t_host_device)
@ -822,11 +781,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  //! \name Methods for getting capacity, stride, or dimension(s).
  //@{

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  //! The allocation size (same as Kokkos::View::capacity).
-  size_t capacity() const { return d_view.span(); }
-#endif
-
  //! The allocation size (same as Kokkos::View::span).
  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return d_view.span(); }

@ -854,29 +808,6 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
    return static_cast<int>(d_view.extent(r));
  }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  /*  Deprecate all 'dimension' functions in favor of
-   *  ISO/C++ vocabulary 'extent'.
-   */
-
-  /* \brief return size of dimension 0 */
-  size_t dimension_0() const { return d_view.extent(0); }
-  /* \brief return size of dimension 1 */
-  size_t dimension_1() const { return d_view.extent(1); }
-  /* \brief return size of dimension 2 */
-  size_t dimension_2() const { return d_view.extent(2); }
-  /* \brief return size of dimension 3 */
-  size_t dimension_3() const { return d_view.extent(3); }
-  /* \brief return size of dimension 4 */
-  size_t dimension_4() const { return d_view.extent(4); }
-  /* \brief return size of dimension 5 */
-  size_t dimension_5() const { return d_view.extent(5); }
-  /* \brief return size of dimension 6 */
-  size_t dimension_6() const { return d_view.extent(6); }
-  /* \brief return size of dimension 7 */
-  size_t dimension_7() const { return d_view.extent(7); }
-#endif
-
  //@}
 };

@ -893,13 +824,12 @@ namespace Impl {

 template <class D, class A1, class A2, class A3, class... Args>
 struct DualViewSubview {
-  typedef typename Kokkos::Impl::ViewMapping<
-      void, Kokkos::ViewTraits<D, A1, A2, A3>, Args...>::traits_type dst_traits;
+  using dst_traits = typename Kokkos::Impl::ViewMapping<
+      void, Kokkos::ViewTraits<D, A1, A2, A3>, Args...>::traits_type;

-  typedef Kokkos::DualView<
+  using type = Kokkos::DualView<
      typename dst_traits::data_type, typename dst_traits::array_layout,
-      typename dst_traits::device_type, typename dst_traits::memory_traits>
-      type;
+      typename dst_traits::device_type, typename dst_traits::memory_traits>;
 };

 } /* namespace Impl */
--- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
@ -349,8 +349,8 @@ class ViewMapping<
 public:
  enum { is_assignable = is_assignable_value_type && is_assignable_layout };

-  typedef ViewMapping<DstTraits, typename DstTraits::specialize> DstType;
-  typedef ViewMapping<SrcTraits, typename SrcTraits::specialize> SrcType;
+  using DstType = ViewMapping<DstTraits, typename DstTraits::specialize>;
+  using SrcType = ViewMapping<SrcTraits, typename SrcTraits::specialize>;

  template <typename DT, typename... DP, typename ST, typename... SP>
  KOKKOS_INLINE_FUNCTION static void assign(
@ -365,13 +365,13 @@ class ViewMapping<

    // Removed dimension checks...

-    typedef typename DstType::offset_type dst_offset_type;
+    using dst_offset_type   = typename DstType::offset_type;
    dst.m_map.m_impl_offset = dst_offset_type(
        std::integral_constant<unsigned, 0>(),
        src.layout());  // Check this for integer input1 for padding, etc
    dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle<DstTraits>::assign(
-        src.m_map.m_impl_handle, src.m_track);
-    dst.m_track.assign(src.m_track, DstTraits::is_managed);
+        src.m_map.m_impl_handle, src.m_track.m_tracker);
+    dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed);
    dst.m_rank = src.Rank;
  }
 };
@ -415,16 +415,16 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  friend class Kokkos::Impl::ViewMapping;

 public:
-  typedef ViewTraits<DataType, Properties...> drvtraits;
+  using drvtraits = ViewTraits<DataType, Properties...>;

-  typedef View<DataType*******, Properties...> view_type;
+  using view_type = View<DataType*******, Properties...>;

-  typedef ViewTraits<DataType*******, Properties...> traits;
+  using traits = ViewTraits<DataType*******, Properties...>;

 private:
-  typedef Kokkos::Impl::ViewMapping<traits, typename traits::specialize>
-      map_type;
-  typedef Kokkos::Impl::SharedAllocationTracker track_type;
+  using map_type =
+      Kokkos::Impl::ViewMapping<traits, typename traits::specialize>;
+  using track_type = Kokkos::Impl::SharedAllocationTracker;

  track_type m_track;
  map_type m_map;
@ -440,28 +440,24 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  // 7 data_type of the traits

  /** \brief  Compatible view of array of scalar types */
-  typedef DynRankView<
+  using array_type = DynRankView<
      typename drvtraits::scalar_array_type, typename drvtraits::array_layout,
-      typename drvtraits::device_type, typename drvtraits::memory_traits>
-      array_type;
+      typename drvtraits::device_type, typename drvtraits::memory_traits>;

  /** \brief  Compatible view of const data type */
-  typedef DynRankView<
+  using const_type = DynRankView<
      typename drvtraits::const_data_type, typename drvtraits::array_layout,
-      typename drvtraits::device_type, typename drvtraits::memory_traits>
-      const_type;
+      typename drvtraits::device_type, typename drvtraits::memory_traits>;

  /** \brief  Compatible view of non-const data type */
-  typedef DynRankView<
+  using non_const_type = DynRankView<
      typename drvtraits::non_const_data_type, typename drvtraits::array_layout,
-      typename drvtraits::device_type, typename drvtraits::memory_traits>
-      non_const_type;
+      typename drvtraits::device_type, typename drvtraits::memory_traits>;

  /** \brief  Compatible HostMirror view */
-  typedef DynRankView<typename drvtraits::non_const_data_type,
-                      typename drvtraits::array_layout,
-                      typename drvtraits::host_mirror_space>
-      HostMirror;
+  using HostMirror = DynRankView<typename drvtraits::non_const_data_type,
+                                 typename drvtraits::array_layout,
+                                 typename drvtraits::host_mirror_space>;

  //----------------------------------------
  // Domain rank and extents
@ -493,42 +489,6 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
   *  ISO/C++ vocabulary 'extent'.
   */

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  template <typename iType>
-  KOKKOS_INLINE_FUNCTION constexpr
-      typename std::enable_if<std::is_integral<iType>::value, size_t>::type
-      dimension(const iType& r) const {
-    return extent(r);
-  }
-
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const {
-    return m_map.dimension_0();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const {
-    return m_map.dimension_1();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const {
-    return m_map.dimension_2();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const {
-    return m_map.dimension_3();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const {
-    return m_map.dimension_4();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const {
-    return m_map.dimension_5();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const {
-    return m_map.dimension_6();
-  }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const {
-    return m_map.dimension_7();
-  }
-#endif
-
-  //----------------------------------------
-
  KOKKOS_INLINE_FUNCTION constexpr size_t size() const {
    return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) *
           m_map.extent(3) * m_map.extent(4) * m_map.extent(5) *
@ -568,8 +528,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  //----------------------------------------
  // Range span is the span which contains all members.

-  typedef typename map_type::reference_type reference_type;
-  typedef typename map_type::pointer_type pointer_type;
+  using reference_type = typename map_type::reference_type;
+  using pointer_type   = typename map_type::pointer_type;

  enum {
    reference_type_is_lvalue_reference =
@ -577,39 +537,18 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  };

  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  // Deprecated, use 'span()' instead
-  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const {
-    return m_map.span();
-  }
-#endif
  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const {
    return m_map.span_is_contiguous();
  }
  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const {
    return m_map.data();
  }
-
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  // Deprecated, use 'span_is_contigous()' instead
-  KOKKOS_INLINE_FUNCTION constexpr bool is_contiguous() const {
-    return m_map.span_is_contiguous();
+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return (m_map.data() != nullptr);
  }
-  // Deprecated, use 'data()' instead
-  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const {
-    return m_map.data();
-  }
-#endif

  //----------------------------------------
  // Allow specializations to query their specialized map
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  KOKKOS_INLINE_FUNCTION
-  const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>&
-  implementation_map() const {
-    return m_map;
-  }
-#endif
  KOKKOS_INLINE_FUNCTION
  const Kokkos::Impl::ViewMapping<traits, typename traits::specialize>&
  impl_map() const {
@ -709,12 +648,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
    const size_t dim_scalar = m_map.dimension_scalar();
    const size_t bytes      = this->span() / dim_scalar;

-    typedef Kokkos::View<
+    using tmp_view_type = Kokkos::View<
        DataType*, typename traits::array_layout, typename traits::device_type,
        Kokkos::MemoryTraits<traits::memory_traits::is_unmanaged |
                             traits::memory_traits::is_random_access |
-                             traits::memory_traits::is_atomic> >
-        tmp_view_type;
+                             traits::memory_traits::is_atomic> >;
    tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
    return rankone_view(i0);
  }
@ -1102,10 +1040,9 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView<RT, RP...>& rhs)
      : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) {
-    typedef typename DynRankView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
-                                      typename traits::specialize>
-        Mapping;
+    using SrcTraits = typename DynRankView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits,
+                                              typename traits::specialize>;
    static_assert(Mapping::is_assignable,
                  "Incompatible DynRankView copy construction");
    Mapping::assign(m_map, rhs.m_map, rhs.m_track);
@ -1114,10 +1051,9 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION DynRankView& operator=(
      const DynRankView<RT, RP...>& rhs) {
-    typedef typename DynRankView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
-                                      typename traits::specialize>
-        Mapping;
+    using SrcTraits = typename DynRankView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits,
+                                              typename traits::specialize>;
    static_assert(Mapping::is_assignable,
                  "Incompatible DynRankView copy construction");
    Mapping::assign(m_map, rhs.m_map, rhs.m_track);
@ -1130,10 +1066,10 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION DynRankView(const View<RT, RP...>& rhs)
      : m_track(), m_map(), m_rank(rhs.Rank) {
-    typedef typename View<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
-                                      Kokkos::Impl::ViewToDynRankViewTag>
-        Mapping;
+    using SrcTraits = typename View<RT, RP...>::traits;
+    using Mapping =
+        Kokkos::Impl::ViewMapping<traits, SrcTraits,
+                                  Kokkos::Impl::ViewToDynRankViewTag>;
    static_assert(Mapping::is_assignable,
                  "Incompatible View to DynRankView copy construction");
    Mapping::assign(*this, rhs);
@ -1141,10 +1077,10 @@ class DynRankView : public ViewTraits<DataType, Properties...> {

  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View<RT, RP...>& rhs) {
-    typedef typename View<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits,
-                                      Kokkos::Impl::ViewToDynRankViewTag>
-        Mapping;
+    using SrcTraits = typename View<RT, RP...>::traits;
+    using Mapping =
+        Kokkos::Impl::ViewMapping<traits, SrcTraits,
+                                  Kokkos::Impl::ViewToDynRankViewTag>;
    static_assert(Mapping::is_assignable,
                  "Incompatible View to DynRankView copy assignment");
    Mapping::assign(*this, rhs);
@ -1177,11 +1113,11 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
                   template computeRank<typename traits::array_layout, P...>(
                       arg_prop, arg_layout)) {
    // Append layout and spaces if not input
-    typedef Kokkos::Impl::ViewCtorProp<P...> alloc_prop_input;
+    using alloc_prop_input = Kokkos::Impl::ViewCtorProp<P...>;

    // use 'std::integral_constant<unsigned,I>' for non-types
    // to avoid duplicate class error.
-    typedef Kokkos::Impl::ViewCtorProp<
+    using alloc_prop = Kokkos::Impl::ViewCtorProp<
        P...,
        typename std::conditional<alloc_prop_input::has_label,
                                  std::integral_constant<unsigned, 0>,
@ -1193,19 +1129,13 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
        typename std::conditional<
            alloc_prop_input::has_execution_space,
            std::integral_constant<unsigned, 2>,
-            typename traits::device_type::execution_space>::type>
-        alloc_prop;
+            typename traits::device_type::execution_space>::type>;

    static_assert(traits::is_managed,
                  "View allocation constructor requires managed memory");

    if (alloc_prop::initialize &&
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-        !alloc_prop::execution_space::is_initialized()
-#else
-        !alloc_prop::execution_space::impl_is_initialized()
-#endif
-    ) {
+        !alloc_prop::execution_space::impl_is_initialized()) {
      // If initializing view data then
      // the execution space must be initialized.
      Kokkos::Impl::throw_runtime_exception(
@ -1499,36 +1429,34 @@ struct ViewMapping<
           unsigned(R4) + unsigned(R5) + unsigned(R6)
  };

-  typedef Kokkos::LayoutStride array_layout;
+  using array_layout = Kokkos::LayoutStride;

-  typedef typename SrcTraits::value_type value_type;
+  using value_type = typename SrcTraits::value_type;

-  typedef value_type******* data_type;
+  using data_type = value_type*******;

 public:
-  typedef Kokkos::ViewTraits<data_type, array_layout,
-                             typename SrcTraits::device_type,
-                             typename SrcTraits::memory_traits>
-      traits_type;
+  using traits_type = Kokkos::ViewTraits<data_type, array_layout,
+                                         typename SrcTraits::device_type,
+                                         typename SrcTraits::memory_traits>;

-  typedef Kokkos::View<data_type, array_layout, typename SrcTraits::device_type,
-                       typename SrcTraits::memory_traits>
-      type;
+  using type =
+      Kokkos::View<data_type, array_layout, typename SrcTraits::device_type,
+                   typename SrcTraits::memory_traits>;

  template <class MemoryTraits>
  struct apply {
    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");

-    typedef Kokkos::ViewTraits<data_type, array_layout,
-                               typename SrcTraits::device_type, MemoryTraits>
-        traits_type;
+    using traits_type =
+        Kokkos::ViewTraits<data_type, array_layout,
+                           typename SrcTraits::device_type, MemoryTraits>;

-    typedef Kokkos::View<data_type, array_layout,
-                         typename SrcTraits::device_type, MemoryTraits>
-        type;
+    using type = Kokkos::View<data_type, array_layout,
+                              typename SrcTraits::device_type, MemoryTraits>;
  };

-  typedef typename SrcTraits::dimension dimension;
+  using dimension = typename SrcTraits::dimension;

  template <class Arg0 = int, class Arg1 = int, class Arg2 = int,
            class Arg3 = int, class Arg4 = int, class Arg5 = int,
@ -1544,18 +1472,17 @@ struct ViewMapping<
    }
  };

-  typedef Kokkos::DynRankView<value_type, array_layout,
-                              typename SrcTraits::device_type,
-                              typename SrcTraits::memory_traits>
-      ret_type;
+  using ret_type = Kokkos::DynRankView<value_type, array_layout,
+                                       typename SrcTraits::device_type,
+                                       typename SrcTraits::memory_traits>;

  template <typename T, class... P>
  KOKKOS_INLINE_FUNCTION static ret_type subview(
      const unsigned src_rank, Kokkos::DynRankView<T, P...> const& src,
      Args... args) {
-    typedef ViewMapping<traits_type, typename traits_type::specialize> DstType;
+    using DstType = ViewMapping<traits_type, typename traits_type::specialize>;

-    typedef typename std::conditional<
+    using DstDimType = typename std::conditional<
        (rank == 0), ViewDimension<>,
        typename std::conditional<
            (rank == 1), ViewDimension<0>,
@ -1570,10 +1497,10 @@ struct ViewMapping<
                            typename std::conditional<
                                (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>,
                                ViewDimension<0, 0, 0, 0, 0, 0, 0> >::type>::
-                            type>::type>::type>::type>::type>::type DstDimType;
+                            type>::type>::type>::type>::type>::type;

-    typedef ViewOffset<DstDimType, Kokkos::LayoutStride> dst_offset_type;
-    typedef typename DstType::handle_type dst_handle_type;
+    using dst_offset_type = ViewOffset<DstDimType, Kokkos::LayoutStride>;
+    using dst_handle_type = typename DstType::handle_type;

    ret_type dst;

@ -1636,9 +1563,9 @@ subdynrankview(const Kokkos::DynRankView<D, P...>& src, Args... args) {
        "DynRankView");
  }

-  typedef Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag,
-                                    Kokkos::ViewTraits<D*******, P...>, Args...>
-      metafcn;
+  using metafcn =
+      Kokkos::Impl::ViewMapping<Kokkos::Impl::DynRankSubviewTag,
+                                Kokkos::ViewTraits<D*******, P...>, Args...>;

  return metafcn::subview(src.rank(), src, args...);
 }
@ -1659,8 +1586,8 @@ template <class LT, class... LP, class RT, class... RP>
 KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView<LT, LP...>& lhs,
                                       const DynRankView<RT, RP...>& rhs) {
  // Same data, layout, dimensions
-  typedef ViewTraits<LT, LP...> lhs_traits;
-  typedef ViewTraits<RT, RP...> rhs_traits;
+  using lhs_traits = ViewTraits<LT, LP...>;
+  using rhs_traits = ViewTraits<RT, RP...>;

  return std::is_same<typename lhs_traits::const_value_type,
                      typename rhs_traits::const_value_type>::value &&
@ -1691,7 +1618,7 @@ namespace Impl {

 template <class OutputView, typename Enable = void>
 struct DynRankViewFill {
-  typedef typename OutputView::traits::const_value_type const_value_type;
+  using const_value_type = typename OutputView::traits::const_value_type;

  const OutputView output;
  const_value_type input;
@ -1722,15 +1649,11 @@ struct DynRankViewFill {

  DynRankViewFill(const OutputView& arg_out, const_value_type& arg_in)
      : output(arg_out), input(arg_in) {
-    typedef typename OutputView::execution_space execution_space;
-    typedef Kokkos::RangePolicy<execution_space> Policy;
+    using execution_space = typename OutputView::execution_space;
+    using Policy          = Kokkos::RangePolicy<execution_space>;

-    const Kokkos::Impl::ParallelFor<DynRankViewFill, Policy> closure(
-        *this, Policy(0, output.extent(0)));
-
-    closure.execute();
-
-    execution_space().fence();
+    Kokkos::parallel_for("Kokkos::DynRankViewFill", Policy(0, output.extent(0)),
+                         *this);
  }
 };

@ -1770,11 +1693,9 @@ struct DynRankViewRemap {
        n5(std::min((size_t)arg_out.extent(5), (size_t)arg_in.extent(5))),
        n6(std::min((size_t)arg_out.extent(6), (size_t)arg_in.extent(6))),
        n7(std::min((size_t)arg_out.extent(7), (size_t)arg_in.extent(7))) {
-    typedef Kokkos::RangePolicy<ExecSpace> Policy;
-    const Kokkos::Impl::ParallelFor<DynRankViewRemap, Policy> closure(
-        *this, Policy(0, n0));
-    closure.execute();
-    // ExecSpace().fence(); // ??
+    using Policy = Kokkos::RangePolicy<ExecSpace>;
+
+    Kokkos::parallel_for("Kokkos::DynRankViewRemap", Policy(0, n0), *this);
  }

  KOKKOS_INLINE_FUNCTION
@ -1814,7 +1735,9 @@ inline void deep_copy(
                   typename ViewTraits<DT, DP...>::value_type>::value,
      "deep_copy requires non-const type");

+  Kokkos::fence();
  Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value);
+  Kokkos::fence();
 }

 /** \brief  Deep copy into a value in Host memory from a view.  */
@ -1828,10 +1751,12 @@ inline void deep_copy(
    Kokkos::abort("");
  }

-  typedef ViewTraits<ST, SP...> src_traits;
-  typedef typename src_traits::memory_space src_memory_space;
+  using src_traits       = ViewTraits<ST, SP...>;
+  using src_memory_space = typename src_traits::memory_space;
+  Kokkos::fence();
  Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(),
                                                      sizeof(ST));
+  Kokkos::fence();
 }

 //----------------------------------------------------------------------------
@ -1851,13 +1776,13 @@ inline void deep_copy(
                   typename DstType::traits::non_const_value_type>::value,
      "deep_copy requires non-const destination type");

-  typedef DstType dst_type;
-  typedef SrcType src_type;
+  using dst_type = DstType;
+  using src_type = SrcType;

-  typedef typename dst_type::execution_space dst_execution_space;
-  typedef typename src_type::execution_space src_execution_space;
-  typedef typename dst_type::memory_space dst_memory_space;
-  typedef typename src_type::memory_space src_memory_space;
+  using dst_execution_space = typename dst_type::execution_space;
+  using src_execution_space = typename src_type::execution_space;
+  using dst_memory_space    = typename dst_type::memory_space;
+  using src_memory_space    = typename src_type::memory_space;

  enum {
    DstExecCanAccessSrc =
@ -1878,9 +1803,11 @@ inline void deep_copy(
    // If same type, equal layout, equal dimensions, equal span, and contiguous
    // memory then can byte-wise copy
    if (rank(src) == 0 && rank(dst) == 0) {
-      typedef typename dst_type::value_type value_type;
+      using value_type = typename dst_type::value_type;
+      Kokkos::fence();
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), sizeof(value_type));
+      Kokkos::fence();
    } else if (std::is_same<
                   typename DstType::traits::value_type,
                   typename SrcType::traits::non_const_value_type>::value &&
@ -1902,9 +1829,10 @@ inline void deep_copy(
               dst.extent(6) == src.extent(6) &&
               dst.extent(7) == src.extent(7)) {
      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-
+      Kokkos::fence();
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), nbytes);
+      Kokkos::fence();
    } else if (std::is_same<
                   typename DstType::traits::value_type,
                   typename SrcType::traits::non_const_value_type>::value &&
@ -1931,22 +1859,29 @@ inline void deep_copy(
               dst.stride_6() == src.stride_6() &&
               dst.stride_7() == src.stride_7()) {
      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-
+      Kokkos::fence();
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), nbytes);
+      Kokkos::fence();
    } else if (DstExecCanAccessSrc) {
      // Copying data between views in accessible memory spaces and either
      // non-contiguous or incompatible shape.
+      Kokkos::fence();
      Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src);
+      Kokkos::fence();
    } else if (SrcExecCanAccessDst) {
      // Copying data between views in accessible memory spaces and either
      // non-contiguous or incompatible shape.
+      Kokkos::fence();
      Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>(
          dst, src);
+      Kokkos::fence();
    } else {
      Kokkos::Impl::throw_runtime_exception(
          "deep_copy given views that would require a temporary allocation");
    }
+  } else {
+    Kokkos::fence();
  }
 }

@ -1962,45 +1897,45 @@ namespace Impl {
 template <class Space, class T, class... P>
 struct MirrorDRViewType {
  // The incoming view_type
-  typedef typename Kokkos::DynRankView<T, P...> src_view_type;
+  using src_view_type = typename Kokkos::DynRankView<T, P...>;
  // The memory space for the mirror view
-  typedef typename Space::memory_space memory_space;
+  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
-  typedef typename src_view_type::array_layout array_layout;
+  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it).
-  typedef typename src_view_type::non_const_data_type data_type;
+  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
-  typedef Kokkos::DynRankView<data_type, array_layout, Space> dest_view_type;
+  using dest_view_type = Kokkos::DynRankView<data_type, array_layout, Space>;
  // If it is the same memory_space return the existing view_type
  // This will also keep the unmanaged trait if necessary
-  typedef typename std::conditional<is_same_memspace, src_view_type,
-                                    dest_view_type>::type view_type;
+  using view_type = typename std::conditional<is_same_memspace, src_view_type,
+                                              dest_view_type>::type;
 };

 template <class Space, class T, class... P>
 struct MirrorDRVType {
  // The incoming view_type
-  typedef typename Kokkos::DynRankView<T, P...> src_view_type;
+  using src_view_type = typename Kokkos::DynRankView<T, P...>;
  // The memory space for the mirror view
-  typedef typename Space::memory_space memory_space;
+  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
-  typedef typename src_view_type::array_layout array_layout;
+  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it).
-  typedef typename src_view_type::non_const_data_type data_type;
+  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
-  typedef Kokkos::DynRankView<data_type, array_layout, Space> view_type;
+  using view_type = Kokkos::DynRankView<data_type, array_layout, Space>;
 };

 }  // namespace Impl
@ -2012,8 +1947,8 @@ inline typename DynRankView<T, P...>::HostMirror create_mirror(
        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
        !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                      Kokkos::LayoutStride>::value>::type* = nullptr) {
-  typedef DynRankView<T, P...> src_type;
-  typedef typename src_type::HostMirror dst_type;
+  using src_type = DynRankView<T, P...>;
+  using dst_type = typename src_type::HostMirror;

  return dst_type(std::string(src.label()).append("_mirror"),
                  Impl::reconstructLayout(src.layout(), src.rank()));
@ -2026,8 +1961,8 @@ inline typename DynRankView<T, P...>::HostMirror create_mirror(
        std::is_same<typename ViewTraits<T, P...>::specialize, void>::value &&
        std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                     Kokkos::LayoutStride>::value>::type* = 0) {
-  typedef DynRankView<T, P...> src_type;
-  typedef typename src_type::HostMirror dst_type;
+  using src_type = DynRankView<T, P...>;
+  using dst_type = typename src_type::HostMirror;

  return dst_type(std::string(src.label()).append("_mirror"),
                  Impl::reconstructLayout(src.layout(), src.rank()));
@ -2066,7 +2001,7 @@ inline typename DynRankView<T, P...>::HostMirror create_mirror_view(
              typename DynRankView<T, P...>::HostMirror::memory_space>::value &&
          std::is_same<typename DynRankView<T, P...>::data_type,
                       typename DynRankView<T, P...>::HostMirror::data_type>::
-              value)>::type* = 0) {
+              value)>::type* = nullptr) {
  return Kokkos::create_mirror(src);
 }

@ -2085,7 +2020,8 @@ template <class Space, class T, class... P>
 typename Impl::MirrorDRViewType<Space, T, P...>::view_type create_mirror_view(
    const Space&, const Kokkos::DynRankView<T, P...>& src,
    typename std::enable_if<
-        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* = 0) {
+        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
+        nullptr) {
  return typename Impl::MirrorDRViewType<Space, T, P...>::view_type(
      src.label(), Impl::reconstructLayout(src.layout(), src.rank()));
 }
@ -2112,7 +2048,8 @@ create_mirror_view_and_copy(
    const Space&, const Kokkos::DynRankView<T, P...>& src,
    std::string const& name = "",
    typename std::enable_if<
-        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* = 0) {
+        !Impl::MirrorDRViewType<Space, T, P...>::is_same_memspace>::type* =
+        nullptr) {
  using Mirror = typename Impl::MirrorDRViewType<Space, T, P...>::view_type;
  std::string label = name.empty() ? src.label() : name;
  auto mirror       = Mirror(Kokkos::ViewAllocateWithoutInitializing(label),
@ -2139,7 +2076,7 @@ inline void resize(DynRankView<T, P...>& v,
                   const size_t n5 = KOKKOS_INVALID_INDEX,
                   const size_t n6 = KOKKOS_INVALID_INDEX,
                   const size_t n7 = KOKKOS_INVALID_INDEX) {
-  typedef DynRankView<T, P...> drview_type;
+  using drview_type = DynRankView<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only resize managed views");
@ -2163,7 +2100,7 @@ inline void realloc(DynRankView<T, P...>& v,
                    const size_t n5 = KOKKOS_INVALID_INDEX,
                    const size_t n6 = KOKKOS_INVALID_INDEX,
                    const size_t n7 = KOKKOS_INVALID_INDEX) {
-  typedef DynRankView<T, P...> drview_type;
+  using drview_type = DynRankView<T, P...>;

  static_assert(Kokkos::ViewTraits<T, P...>::is_managed,
                "Can only realloc managed views");
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@ -85,13 +85,13 @@ struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
 template <typename DataType, typename... P>
 class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 public:
-  typedef Kokkos::ViewTraits<DataType, P...> traits;
+  using traits = Kokkos::ViewTraits<DataType, P...>;

 private:
  template <class, class...>
  friend class DynamicView;

-  typedef Kokkos::Impl::SharedAllocationTracker track_type;
+  using track_type = Kokkos::Impl::SharedAllocationTracker;

  static_assert(traits::rank == 1 && traits::rank_dynamic == 1,
                "DynamicView must be rank-one");
@ -118,8 +118,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {

 private:
  track_type m_track;
-  typename traits::value_type**
-      m_chunks;            // array of pointers to 'chunks' of memory
+  typename traits::value_type** m_chunks =
+      nullptr;             // array of pointers to 'chunks' of memory
  unsigned m_chunk_shift;  // ceil(log2(m_chunk_size))
  unsigned m_chunk_mask;   // m_chunk_size - 1
  unsigned m_chunk_max;  // number of entries in the chunk array - each pointing
@ -130,38 +130,36 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  //----------------------------------------------------------------------

  /** \brief  Compatible view of array of scalar types */
-  typedef DynamicView<typename traits::data_type, typename traits::device_type>
-      array_type;
+  using array_type =
+      DynamicView<typename traits::data_type, typename traits::device_type>;

  /** \brief  Compatible view of const data type */
-  typedef DynamicView<typename traits::const_data_type,
-                      typename traits::device_type>
-      const_type;
+  using const_type = DynamicView<typename traits::const_data_type,
+                                 typename traits::device_type>;

  /** \brief  Compatible view of non-const data type */
-  typedef DynamicView<typename traits::non_const_data_type,
-                      typename traits::device_type>
-      non_const_type;
+  using non_const_type = DynamicView<typename traits::non_const_data_type,
+                                     typename traits::device_type>;

  /** \brief  Must be accessible everywhere */
-  typedef DynamicView HostMirror;
+  using HostMirror = DynamicView;

  /** \brief Unified types */
-  typedef Kokkos::Device<typename traits::device_type::execution_space,
-                         Kokkos::AnonymousSpace>
-      uniform_device;
-  typedef array_type uniform_type;
-  typedef const_type uniform_const_type;
-  typedef array_type uniform_runtime_type;
-  typedef const_type uniform_runtime_const_type;
-  typedef DynamicView<typename traits::data_type, uniform_device>
-      uniform_nomemspace_type;
-  typedef DynamicView<typename traits::const_data_type, uniform_device>
-      uniform_const_nomemspace_type;
-  typedef DynamicView<typename traits::data_type, uniform_device>
-      uniform_runtime_nomemspace_type;
-  typedef DynamicView<typename traits::const_data_type, uniform_device>
-      uniform_runtime_const_nomemspace_type;
+  using uniform_device =
+      Kokkos::Device<typename traits::device_type::execution_space,
+                     Kokkos::AnonymousSpace>;
+  using uniform_type               = array_type;
+  using uniform_const_type         = const_type;
+  using uniform_runtime_type       = array_type;
+  using uniform_runtime_const_type = const_type;
+  using uniform_nomemspace_type =
+      DynamicView<typename traits::data_type, uniform_device>;
+  using uniform_const_nomemspace_type =
+      DynamicView<typename traits::const_data_type, uniform_device>;
+  using uniform_runtime_nomemspace_type =
+      DynamicView<typename traits::data_type, uniform_device>;
+  using uniform_runtime_const_nomemspace_type =
+      DynamicView<typename traits::const_data_type, uniform_device>;

  //----------------------------------------------------------------------

@ -193,17 +191,6 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    return r == 0 ? size() : 1;
  }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1; }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1; }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1; }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return 1; }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return 1; }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return 1; }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return 1; }
-#endif
-
  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return 0; }
  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return 0; }
  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return 0; }
@ -231,8 +218,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  //----------------------------------------------------------------------
  // Range span is the span which contains all members.

-  typedef typename traits::value_type& reference_type;
-  typedef typename traits::value_type* pointer_type;
+  using reference_type = typename traits::value_type&;
+  using pointer_type   = typename traits::value_type*;

  enum {
    reference_type_is_lvalue_reference =
@ -299,8 +286,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
          typename Impl::ChunkArraySpace<
              typename traits::memory_space>::memory_space>::accessible>::type
  resize_serial(IntType const& n) {
-    typedef typename traits::value_type local_value_type;
-    typedef local_value_type* value_pointer_type;
+    using local_value_type   = typename traits::value_type;
+    using value_pointer_type = local_value_type*;

    const uintptr_t NC =
        (n + m_chunk_mask) >>
@ -332,6 +319,17 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    *(pc + 1) = n;
  }

+  KOKKOS_INLINE_FUNCTION bool is_allocated() const {
+    if (m_chunks == nullptr) {
+      return false;
+    } else {
+      // *m_chunks[m_chunk_max] stores the current number of chunks being used
+      uintptr_t* const pc =
+          reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
+      return (*(pc + 1) > 0);
+    }
+  }
+
  //----------------------------------------------------------------------

  ~DynamicView()                  = default;
@ -349,8 +347,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
        m_chunk_mask(rhs.m_chunk_mask),
        m_chunk_max(rhs.m_chunk_max),
        m_chunk_size(rhs.m_chunk_size) {
-    typedef typename DynamicView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
+    using SrcTraits = typename DynamicView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible DynamicView copy construction");
  }
@ -373,9 +371,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    }

    void execute(bool arg_destroy) {
-      typedef Kokkos::RangePolicy<typename HostSpace::execution_space> Range;
-      // typedef Kokkos::RangePolicy< typename Impl::ChunkArraySpace< typename
-      // traits::memory_space >::memory_space::execution_space > Range ;
+      using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>;

      m_destroy = arg_destroy;

@ -431,12 +427,11 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
                    m_chunk_shift)  // max num pointers-to-chunks in array
        ,
        m_chunk_size(2 << (m_chunk_shift - 1)) {
-    typedef typename Impl::ChunkArraySpace<
-        typename traits::memory_space>::memory_space chunk_array_memory_space;
+    using chunk_array_memory_space = typename Impl::ChunkArraySpace<
+        typename traits::memory_space>::memory_space;
    // A functor to deallocate all of the chunks upon final destruction
-    typedef Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space,
-                                                 Destroy>
-        record_type;
+    using record_type =
+        Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>;

    // Allocate chunk pointers and allocation counter
    record_type* const record =
@ -471,11 +466,11 @@ create_mirror_view(const Kokkos::Experimental::DynamicView<T, P...>& src) {
 template <class T, class... DP, class... SP>
 inline void deep_copy(const View<T, DP...>& dst,
                      const Kokkos::Experimental::DynamicView<T, SP...>& src) {
-  typedef View<T, DP...> dst_type;
-  typedef Kokkos::Experimental::DynamicView<T, SP...> src_type;
+  using dst_type = View<T, DP...>;
+  using src_type = Kokkos::Experimental::DynamicView<T, SP...>;

-  typedef typename ViewTraits<T, DP...>::execution_space dst_execution_space;
-  typedef typename ViewTraits<T, SP...>::memory_space src_memory_space;
+  using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
+  using src_memory_space    = typename ViewTraits<T, SP...>::memory_space;

  enum {
    DstExecCanAccessSrc =
@ -496,11 +491,11 @@ inline void deep_copy(const View<T, DP...>& dst,
 template <class T, class... DP, class... SP>
 inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
                      const View<T, SP...>& src) {
-  typedef Kokkos::Experimental::DynamicView<T, SP...> dst_type;
-  typedef View<T, DP...> src_type;
+  using dst_type = Kokkos::Experimental::DynamicView<T, SP...>;
+  using src_type = View<T, DP...>;

-  typedef typename ViewTraits<T, DP...>::execution_space dst_execution_space;
-  typedef typename ViewTraits<T, SP...>::memory_space src_memory_space;
+  using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
+  using src_memory_space    = typename ViewTraits<T, SP...>::memory_space;

  enum {
    DstExecCanAccessSrc =
@ -522,10 +517,10 @@ namespace Impl {
 template <class Arg0, class... DP, class... SP>
 struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,
                     Kokkos::Experimental::DynamicView<SP...>, 1, Arg0> {
-  typedef Kokkos::Experimental::DynamicView<DP...> DstType;
-  typedef Kokkos::Experimental::DynamicView<SP...> SrcType;
-  typedef DstType dst_subview_type;
-  typedef SrcType src_subview_type;
+  using DstType          = Kokkos::Experimental::DynamicView<DP...>;
+  using SrcType          = Kokkos::Experimental::DynamicView<SP...>;
+  using dst_subview_type = DstType;
+  using src_subview_type = SrcType;
  dst_subview_type dst_sub;
  src_subview_type src_sub;
  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& /*arg0*/)
@ -535,9 +530,9 @@ struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>,
 template <class... DP, class SrcType, class Arg0>
 struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>, SrcType, 1,
                     Arg0> {
-  typedef Kokkos::Experimental::DynamicView<DP...> DstType;
-  typedef DstType dst_subview_type;
-  typedef typename Kokkos::Subview<SrcType, Arg0> src_subview_type;
+  using DstType          = Kokkos::Experimental::DynamicView<DP...>;
+  using dst_subview_type = DstType;
+  using src_subview_type = typename Kokkos::Subview<SrcType, Arg0>;
  dst_subview_type dst_sub;
  src_subview_type src_sub;
  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0)
@ -547,9 +542,9 @@ struct CommonSubview<Kokkos::Experimental::DynamicView<DP...>, SrcType, 1,
 template <class DstType, class... SP, class Arg0>
 struct CommonSubview<DstType, Kokkos::Experimental::DynamicView<SP...>, 1,
                     Arg0> {
-  typedef Kokkos::Experimental::DynamicView<SP...> SrcType;
-  typedef typename Kokkos::Subview<DstType, Arg0> dst_subview_type;
-  typedef SrcType src_subview_type;
+  using SrcType          = Kokkos::Experimental::DynamicView<SP...>;
+  using dst_subview_type = typename Kokkos::Subview<DstType, Arg0>;
+  using src_subview_type = SrcType;
  dst_subview_type dst_sub;
  src_subview_type src_sub;
  CommonSubview(const DstType& dst, const SrcType& src, const Arg0& arg0)
@ -559,11 +554,11 @@ struct CommonSubview<DstType, Kokkos::Experimental::DynamicView<SP...>, 1,
 template <class... DP, class ViewTypeB, class Layout, class ExecSpace,
          typename iType>
 struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>, ViewTypeB, Layout,
-                ExecSpace, 1, iType, false> {
+                ExecSpace, 1, iType> {
  Kokkos::Experimental::DynamicView<DP...> a;
  ViewTypeB b;

-  typedef Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>> policy_type;
+  using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>;

  ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
           const ViewTypeB& b_)
@ -580,11 +575,11 @@ template <class... DP, class... SP, class Layout, class ExecSpace,
          typename iType>
 struct ViewCopy<Kokkos::Experimental::DynamicView<DP...>,
                Kokkos::Experimental::DynamicView<SP...>, Layout, ExecSpace, 1,
-                iType, false> {
+                iType> {
  Kokkos::Experimental::DynamicView<DP...> a;
  Kokkos::Experimental::DynamicView<SP...> b;

-  typedef Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>> policy_type;
+  using policy_type = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<iType>>;

  ViewCopy(const Kokkos::Experimental::DynamicView<DP...>& a_,
           const Kokkos::Experimental::DynamicView<SP...>& b_)
--- a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
+++ b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp
@ -56,9 +56,9 @@ namespace Experimental {
 template <typename ReportType, typename DeviceType>
 class ErrorReporter {
 public:
-  typedef ReportType report_type;
-  typedef DeviceType device_type;
-  typedef typename device_type::execution_space execution_space;
+  using report_type     = ReportType;
+  using device_type     = DeviceType;
+  using execution_space = typename device_type::execution_space;

  ErrorReporter(int max_results)
      : m_numReportsAttempted(""),
@ -103,10 +103,10 @@ class ErrorReporter {
  }

 private:
-  typedef Kokkos::View<report_type *, execution_space> reports_view_t;
-  typedef Kokkos::DualView<report_type *, execution_space> reports_dualview_t;
+  using reports_view_t     = Kokkos::View<report_type *, execution_space>;
+  using reports_dualview_t = Kokkos::DualView<report_type *, execution_space>;

-  typedef typename reports_dualview_t::host_mirror_space host_mirror_space;
+  using host_mirror_space = typename reports_dualview_t::host_mirror_space;
  Kokkos::View<int, execution_space> m_numReportsAttempted;
  reports_dualview_t m_reports;
  Kokkos::DualView<int *, execution_space> m_reporters;
--- a/lib/kokkos/containers/src/Kokkos_Functional.hpp
+++ b/lib/kokkos/containers/src/Kokkos_Functional.hpp
@ -52,10 +52,10 @@ namespace Kokkos {

 template <typename T>
 struct pod_hash {
-  typedef T argument_type;
-  typedef T first_argument_type;
-  typedef uint32_t second_argument_type;
-  typedef uint32_t result_type;
+  using argument_type        = T;
+  using first_argument_type  = T;
+  using second_argument_type = uint32_t;
+  using result_type          = uint32_t;

  KOKKOS_FORCEINLINE_FUNCTION
  uint32_t operator()(T const& t) const {
@ -70,9 +70,9 @@ struct pod_hash {

 template <typename T>
 struct pod_equal_to {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const {
@ -82,9 +82,9 @@ struct pod_equal_to {

 template <typename T>
 struct pod_not_equal_to {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const {
@ -94,9 +94,9 @@ struct pod_not_equal_to {

 template <typename T>
 struct equal_to {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const { return a == b; }
@ -104,9 +104,9 @@ struct equal_to {

 template <typename T>
 struct not_equal_to {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const { return a != b; }
@ -114,9 +114,9 @@ struct not_equal_to {

 template <typename T>
 struct greater {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const { return a > b; }
@ -124,9 +124,9 @@ struct greater {

 template <typename T>
 struct less {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const { return a < b; }
@ -134,9 +134,9 @@ struct less {

 template <typename T>
 struct greater_equal {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const { return a >= b; }
@ -144,9 +144,9 @@ struct greater_equal {

 template <typename T>
 struct less_equal {
-  typedef T first_argument_type;
-  typedef T second_argument_type;
-  typedef bool result_type;
+  using first_argument_type  = T;
+  using second_argument_type = T;
+  using result_type          = bool;

  KOKKOS_FORCEINLINE_FUNCTION
  bool operator()(T const& a, T const& b) const { return a <= b; }
--- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp
@ -51,10 +51,10 @@ namespace Impl {

 template <class ViewType>
 struct GetOffsetViewTypeFromViewType {
-  typedef OffsetView<
-      typename ViewType::data_type, typename ViewType::array_layout,
-      typename ViewType::device_type, typename ViewType::memory_traits>
-      type;
+  using type =
+      OffsetView<typename ViewType::data_type, typename ViewType::array_layout,
+                 typename ViewType::device_type,
+                 typename ViewType::memory_traits>;
 };

 template <unsigned, class MapType, class BeginsType>
@ -180,7 +180,7 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank,
 template <class DataType, class... Properties>
 class OffsetView : public ViewTraits<DataType, Properties...> {
 public:
-  typedef ViewTraits<DataType, Properties...> traits;
+  using traits = ViewTraits<DataType, Properties...>;

 private:
  template <class, class...>
@ -190,12 +190,12 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  template <class, class...>
  friend class Kokkos::Impl::ViewMapping;

-  typedef Kokkos::Impl::ViewMapping<traits, void> map_type;
-  typedef Kokkos::Impl::SharedAllocationTracker track_type;
+  using map_type   = Kokkos::Impl::ViewMapping<traits, void>;
+  using track_type = Kokkos::Impl::SharedAllocationTracker;

 public:
  enum { Rank = map_type::Rank };
-  typedef Kokkos::Array<int64_t, Rank> begins_type;
+  using begins_type = Kokkos::Array<int64_t, Rank>;

  template <
      typename iType,
@ -223,28 +223,27 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
 public:
  //----------------------------------------
  /** \brief  Compatible view of array of scalar types */
-  typedef OffsetView<
-      typename traits::scalar_array_type, typename traits::array_layout,
-      typename traits::device_type, typename traits::memory_traits>
-      array_type;
+  using array_type =
+      OffsetView<typename traits::scalar_array_type,
+                 typename traits::array_layout, typename traits::device_type,
+                 typename traits::memory_traits>;

  /** \brief  Compatible view of const data type */
-  typedef OffsetView<
-      typename traits::const_data_type, typename traits::array_layout,
-      typename traits::device_type, typename traits::memory_traits>
-      const_type;
+  using const_type =
+      OffsetView<typename traits::const_data_type,
+                 typename traits::array_layout, typename traits::device_type,
+                 typename traits::memory_traits>;

  /** \brief  Compatible view of non-const data type */
-  typedef OffsetView<
-      typename traits::non_const_data_type, typename traits::array_layout,
-      typename traits::device_type, typename traits::memory_traits>
-      non_const_type;
+  using non_const_type =
+      OffsetView<typename traits::non_const_data_type,
+                 typename traits::array_layout, typename traits::device_type,
+                 typename traits::memory_traits>;

  /** \brief  Compatible HostMirror view */
-  typedef OffsetView<typename traits::non_const_data_type,
-                     typename traits::array_layout,
-                     typename traits::host_mirror_space>
-      HostMirror;
+  using HostMirror = OffsetView<typename traits::non_const_data_type,
+                                typename traits::array_layout,
+                                typename traits::host_mirror_space>;

  //----------------------------------------
  // Domain rank and extents
@ -335,8 +334,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  //----------------------------------------
  // Range span is the span which contains all members.

-  typedef typename map_type::reference_type reference_type;
-  typedef typename map_type::pointer_type pointer_type;
+  using reference_type = typename map_type::reference_type;
+  using pointer_type   = typename map_type::pointer_type;

  enum {
    reference_type_is_lvalue_reference =
@ -347,6 +346,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const {
    return m_map.span_is_contiguous();
  }
+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return m_map.data() != nullptr;
+  }
  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const {
    return m_map.data();
  }
@ -841,10 +843,9 @@ class OffsetView : public ViewTraits<DataType, Properties...> {

  // interoperability with View
 private:
-  typedef View<typename traits::scalar_array_type,
-               typename traits::array_layout, typename traits::device_type,
-               typename traits::memory_traits>
-      view_type;
+  using view_type =
+      View<typename traits::scalar_array_type, typename traits::array_layout,
+           typename traits::device_type, typename traits::memory_traits>;

 public:
  KOKKOS_INLINE_FUNCTION
@ -856,8 +857,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  template <class RT, class... RP>
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview)
      : m_track(aview.impl_track()), m_map() {
-    typedef typename OffsetView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
+    using SrcTraits = typename OffsetView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);
@ -871,8 +872,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview,
                                    const index_list_type& minIndices)
      : m_track(aview.impl_track()), m_map() {
-    typedef typename OffsetView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
+    using SrcTraits = typename OffsetView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);
@ -894,8 +895,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
  KOKKOS_INLINE_FUNCTION OffsetView(const View<RT, RP...>& aview,
                                    const begins_type& beg)
      : m_track(aview.impl_track()), m_map(), m_begins(beg) {
-    typedef typename OffsetView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
+    using SrcTraits = typename OffsetView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, aview.impl_map(), m_track);
@ -917,8 +918,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
      : m_track(rhs.m_track, traits::is_managed),
        m_map(),
        m_begins(rhs.m_begins) {
-    typedef typename OffsetView<RT, RP...>::traits SrcTraits;
-    typedef Kokkos::Impl::ViewMapping<traits, SrcTraits, void> Mapping;
+    using SrcTraits = typename OffsetView<RT, RP...>::traits;
+    using Mapping   = Kokkos::Impl::ViewMapping<traits, SrcTraits, void>;
    static_assert(Mapping::is_assignable,
                  "Incompatible OffsetView copy construction");
    Mapping::assign(m_map, rhs.m_map, rhs.m_track);  // swb what about assign?
@ -1215,11 +1216,11 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
    for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i];

    // Append layout and spaces if not input
-    typedef Kokkos::Impl::ViewCtorProp<P...> alloc_prop_input;
+    using alloc_prop_input = Kokkos::Impl::ViewCtorProp<P...>;

    // use 'std::integral_constant<unsigned,I>' for non-types
    // to avoid duplicate class error.
-    typedef Kokkos::Impl::ViewCtorProp<
+    using alloc_prop = Kokkos::Impl::ViewCtorProp<
        P...,
        typename std::conditional<alloc_prop_input::has_label,
                                  std::integral_constant<unsigned, 0>,
@ -1231,19 +1232,13 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
        typename std::conditional<
            alloc_prop_input::has_execution_space,
            std::integral_constant<unsigned, 2>,
-            typename traits::device_type::execution_space>::type>
-        alloc_prop;
+            typename traits::device_type::execution_space>::type>;

    static_assert(traits::is_managed,
                  "OffsetView allocation constructor requires managed memory");

    if (alloc_prop::initialize &&
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-        !alloc_prop::execution_space::is_initialized()
-#else
-        !alloc_prop::execution_space::impl_is_initialized()
-#endif
-    ) {
+        !alloc_prop::execution_space::impl_is_initialized()) {
      // If initializing view data then
      // the execution space must be initialized.
      Kokkos::Impl::throw_runtime_exception(
@ -1764,8 +1759,8 @@ template <class LT, class... LP, class RT, class... RP>
 KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs,
                                       const OffsetView<RT, RP...>& rhs) {
  // Same data, layout, dimensions
-  typedef ViewTraits<LT, LP...> lhs_traits;
-  typedef ViewTraits<RT, RP...> rhs_traits;
+  using lhs_traits = ViewTraits<LT, LP...>;
+  using rhs_traits = ViewTraits<RT, RP...>;

  return std::is_same<typename lhs_traits::const_value_type,
                      typename rhs_traits::const_value_type>::value &&
@ -1795,8 +1790,8 @@ template <class LT, class... LP, class RT, class... RP>
 KOKKOS_INLINE_FUNCTION bool operator==(const View<LT, LP...>& lhs,
                                       const OffsetView<RT, RP...>& rhs) {
  // Same data, layout, dimensions
-  typedef ViewTraits<LT, LP...> lhs_traits;
-  typedef ViewTraits<RT, RP...> rhs_traits;
+  using lhs_traits = ViewTraits<LT, LP...>;
+  using rhs_traits = ViewTraits<RT, RP...>;

  return std::is_same<typename lhs_traits::const_value_type,
                      typename rhs_traits::const_value_type>::value &&
@ -1825,10 +1820,10 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView<LT, LP...>& lhs,
 //----------------------------------------------------------------------------

 namespace Kokkos {
-namespace Experimental {
+
 template <class DT, class... DP>
 inline void deep_copy(
-    const OffsetView<DT, DP...>& dst,
+    const Experimental::OffsetView<DT, DP...>& dst,
    typename ViewTraits<DT, DP...>::const_value_type& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
@ -1844,7 +1839,8 @@ inline void deep_copy(

 template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
-    const OffsetView<DT, DP...>& dst, const OffsetView<ST, SP...>& value,
+    const Experimental::OffsetView<DT, DP...>& dst,
+    const Experimental::OffsetView<ST, SP...>& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
        nullptr) {
@ -1858,7 +1854,8 @@ inline void deep_copy(
 }
 template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
-    const OffsetView<DT, DP...>& dst, const View<ST, SP...>& value,
+    const Experimental::OffsetView<DT, DP...>& dst,
+    const View<ST, SP...>& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
        nullptr) {
@ -1873,7 +1870,8 @@ inline void deep_copy(

 template <class DT, class... DP, class ST, class... SP>
 inline void deep_copy(
-    const View<DT, DP...>& dst, const OffsetView<ST, SP...>& value,
+    const View<DT, DP...>& dst,
+    const Experimental::OffsetView<ST, SP...>& value,
    typename std::enable_if<std::is_same<
        typename ViewTraits<DT, DP...>::specialize, void>::value>::type* =
        nullptr) {
@ -1884,53 +1882,54 @@ inline void deep_copy(

  Kokkos::deep_copy(dst, value.view());
 }
+
 namespace Impl {

 // Deduce Mirror Types
 template <class Space, class T, class... P>
 struct MirrorOffsetViewType {
  // The incoming view_type
-  typedef typename Kokkos::Experimental::OffsetView<T, P...> src_view_type;
+  using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>;
  // The memory space for the mirror view
-  typedef typename Space::memory_space memory_space;
+  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
-  typedef typename src_view_type::array_layout array_layout;
+  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it.
-  typedef typename src_view_type::non_const_data_type data_type;
+  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
-  typedef Kokkos::Experimental::OffsetView<data_type, array_layout, Space>
-      dest_view_type;
+  using dest_view_type =
+      Kokkos::Experimental::OffsetView<data_type, array_layout, Space>;
  // If it is the same memory_space return the existsing view_type
  // This will also keep the unmanaged trait if necessary
-  typedef typename std::conditional<is_same_memspace, src_view_type,
-                                    dest_view_type>::type view_type;
+  using view_type = typename std::conditional<is_same_memspace, src_view_type,
+                                              dest_view_type>::type;
 };

 template <class Space, class T, class... P>
 struct MirrorOffsetType {
  // The incoming view_type
-  typedef typename Kokkos::Experimental::OffsetView<T, P...> src_view_type;
+  using src_view_type = typename Kokkos::Experimental::OffsetView<T, P...>;
  // The memory space for the mirror view
-  typedef typename Space::memory_space memory_space;
+  using memory_space = typename Space::memory_space;
  // Check whether it is the same memory space
  enum {
    is_same_memspace =
        std::is_same<memory_space, typename src_view_type::memory_space>::value
  };
  // The array_layout
-  typedef typename src_view_type::array_layout array_layout;
+  using array_layout = typename src_view_type::array_layout;
  // The data type (we probably want it non-const since otherwise we can't even
  // deep_copy to it.
-  typedef typename src_view_type::non_const_data_type data_type;
+  using data_type = typename src_view_type::non_const_data_type;
  // The destination view type if it is not the same memory space
-  typedef Kokkos::Experimental::OffsetView<data_type, array_layout, Space>
-      view_type;
+  using view_type =
+      Kokkos::Experimental::OffsetView<data_type, array_layout, Space>;
 };

 }  // namespace Impl
@ -1942,8 +1941,8 @@ create_mirror(
    typename std::enable_if<
        !std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                      Kokkos::LayoutStride>::value>::type* = 0) {
-  typedef OffsetView<T, P...> src_type;
-  typedef typename src_type::HostMirror dst_type;
+  using src_type = Experimental::OffsetView<T, P...>;
+  using dst_type = typename src_type::HostMirror;

  return dst_type(
      Kokkos::Impl::ViewCtorProp<std::string>(
@ -1962,8 +1961,8 @@ create_mirror(
    typename std::enable_if<
        std::is_same<typename Kokkos::ViewTraits<T, P...>::array_layout,
                     Kokkos::LayoutStride>::value>::type* = 0) {
-  typedef OffsetView<T, P...> src_type;
-  typedef typename src_type::HostMirror dst_type;
+  using src_type = Experimental::OffsetView<T, P...>;
+  using dst_type = typename src_type::HostMirror;

  Kokkos::LayoutStride layout;

@ -1992,14 +1991,13 @@ create_mirror(

 // Create a mirror in a new space (specialization for different space)
 template <class Space, class T, class... P>
-typename Kokkos::Experimental::Impl::MirrorOffsetType<Space, T, P...>::view_type
+typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type
 create_mirror(const Space&,
              const Kokkos::Experimental::OffsetView<T, P...>& src) {
-  return typename Kokkos::Experimental::Impl::MirrorOffsetType<
-      Space, T, P...>::view_type(src.label(), src.layout(),
-                                 {src.begin(0), src.begin(1), src.begin(2),
-                                  src.begin(3), src.begin(4), src.begin(5),
-                                  src.begin(6), src.begin(7)});
+  return typename Kokkos::Impl::MirrorOffsetType<Space, T, P...>::view_type(
+      src.label(), src.layout(),
+      {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
+       src.begin(5), src.begin(6), src.begin(7)});
 }

 template <class T, class... P>
@ -2031,13 +2029,12 @@ create_mirror_view(
              typename Kokkos::Experimental::OffsetView<T, P...>::data_type,
              typename Kokkos::Experimental::OffsetView<
                  T, P...>::HostMirror::data_type>::value)>::type* = 0) {
-  return Kokkos::Experimental::create_mirror(src);
+  return Kokkos::create_mirror(src);
 }

 // Create a mirror view in a new space (specialization for same space)
 template <class Space, class T, class... P>
-typename Kokkos::Experimental::Impl::MirrorOffsetViewType<Space, T,
-                                                          P...>::view_type
+typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
 create_mirror_view(const Space&,
                   const Kokkos::Experimental::OffsetView<T, P...>& src,
                   typename std::enable_if<Impl::MirrorOffsetViewType<
@ -2047,17 +2044,15 @@ create_mirror_view(const Space&,

 // Create a mirror view in a new space (specialization for different space)
 template <class Space, class T, class... P>
-typename Kokkos::Experimental::Impl::MirrorOffsetViewType<Space, T,
-                                                          P...>::view_type
+typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type
 create_mirror_view(const Space&,
                   const Kokkos::Experimental::OffsetView<T, P...>& src,
                   typename std::enable_if<!Impl::MirrorOffsetViewType<
                       Space, T, P...>::is_same_memspace>::type* = 0) {
-  return typename Kokkos::Experimental::Impl::MirrorOffsetViewType<
-      Space, T, P...>::view_type(src.label(), src.layout(),
-                                 {src.begin(0), src.begin(1), src.begin(2),
-                                  src.begin(3), src.begin(4), src.begin(5),
-                                  src.begin(6), src.begin(7)});
+  return typename Kokkos::Impl::MirrorOffsetViewType<Space, T, P...>::view_type(
+      src.label(), src.layout(),
+      {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4),
+       src.begin(5), src.begin(6), src.begin(7)});
 }
 //
 //  // Create a mirror view and deep_copy in a new space (specialization for
@ -2093,7 +2088,6 @@ create_mirror_view(const Space&,
 //    return mirror;
 //  }

-}  // namespace Experimental
 } /* namespace Kokkos */

 //----------------------------------------------------------------------------
--- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp
--- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@ -57,7 +57,7 @@ namespace Kokkos {
 namespace Impl {
 template <class RowOffsetsType, class RowBlockOffsetsType>
 struct StaticCrsGraphBalancerFunctor {
-  typedef typename RowOffsetsType::non_const_value_type int_type;
+  using int_type = typename RowOffsetsType::non_const_value_type;
  RowOffsetsType row_offsets;
  RowBlockOffsetsType row_block_offsets;

@ -148,7 +148,7 @@ struct StaticCrsGraphBalancerFunctor {
 ///
 /// Here is an example loop over the entries in the row:
 /// \code
-/// typedef typename GraphRowViewConst<MatrixType>::ordinal_type ordinal_type;
+/// using ordinal_type = typename GraphRowViewConst<MatrixType>::ordinal_type;
 ///
 /// GraphRowView<GraphType> G_i = ...;
 /// const ordinal_type numEntries = G_i.length;
@ -159,7 +159,7 @@ struct StaticCrsGraphBalancerFunctor {
 /// \endcode
 ///
 /// GraphType must provide the \c data_type
-/// typedefs. In addition, it must make sense to use GraphRowViewConst to
+/// aliases. In addition, it must make sense to use GraphRowViewConst to
 /// view a row of GraphType. In particular, column
 /// indices of a row must be accessible using the <tt>entries</tt>
 /// resp. <tt>colidx</tt> arrays given to the constructor of this
@ -170,7 +170,7 @@ struct StaticCrsGraphBalancerFunctor {
 template <class GraphType>
 struct GraphRowViewConst {
  //! The type of the column indices in the row.
-  typedef const typename GraphType::data_type ordinal_type;
+  using ordinal_type = const typename GraphType::data_type;

 private:
  //! Array of (local) column indices in the row.
@ -279,49 +279,33 @@ struct GraphRowViewConst {
 /// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
 /// </ul>
 template <class DataType, class Arg1Type, class Arg2Type = void,
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-          typename SizeType =
-              typename ViewTraits<DataType*, Arg1Type, Arg2Type>::size_type,
-          class Arg3Type = void>
-#else
          class Arg3Type    = void,
          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type,
                                                  Arg3Type>::size_type>
-#endif
 class StaticCrsGraph {
 private:
-  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, Arg3Type> traits;
+  using traits = ViewTraits<DataType*, Arg1Type, Arg2Type, Arg3Type>;

 public:
-  typedef DataType data_type;
-  typedef typename traits::array_layout array_layout;
-  typedef typename traits::execution_space execution_space;
-  typedef typename traits::device_type device_type;
-  typedef typename traits::memory_traits memory_traits;
-  typedef SizeType size_type;
+  using data_type       = DataType;
+  using array_layout    = typename traits::array_layout;
+  using execution_space = typename traits::execution_space;
+  using device_type     = typename traits::device_type;
+  using memory_traits   = typename traits::memory_traits;
+  using size_type       = SizeType;

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>
-      staticcrsgraph_type;
-  typedef StaticCrsGraph<data_type, array_layout,
-                         typename traits::host_mirror_space, size_type,
-                         memory_traits>
-      HostMirror;
-#else
-  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>
-      staticcrsgraph_type;
-  typedef StaticCrsGraph<data_type, array_layout,
-                         typename traits::host_mirror_space, memory_traits,
-                         size_type>
-      HostMirror;
-#endif
+  using staticcrsgraph_type =
+      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
+  using HostMirror = StaticCrsGraph<data_type, array_layout,
+                                    typename traits::host_mirror_space,
+                                    memory_traits, size_type>;

-  typedef View<const size_type*, array_layout, device_type, memory_traits>
-      row_map_type;
-  typedef View<data_type*, array_layout, device_type, memory_traits>
-      entries_type;
-  typedef View<const size_type*, array_layout, device_type, memory_traits>
-      row_block_type;
+  using row_map_type =
+      View<const size_type*, array_layout, device_type, memory_traits>;
+  using entries_type =
+      View<data_type*, array_layout, device_type, memory_traits>;
+  using row_block_type =
+      View<const size_type*, array_layout, device_type, memory_traits>;

  entries_type entries;
  row_map_type row_map;
@ -370,6 +354,10 @@ class StaticCrsGraph {
               : static_cast<size_type>(0);
  }

+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return (row_map.is_allocated() && entries.is_allocated());
+  }
+
  /// \brief Return a const view of row i of the graph.
  ///
  /// If row i does not belong to the graph, return an empty view.
@ -436,35 +424,19 @@ typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(

 //----------------------------------------------------------------------------

-template <class DataType, class Arg1Type, class Arg2Type,
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-          typename SizeType, class Arg3Type>
-typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                        Arg3Type>::HostMirror
-create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                                        Arg3Type>& input);
-#else
-          class Arg3Type, typename SizeType>
+template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
+          typename SizeType>
 typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                        SizeType>::HostMirror
 create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                        SizeType>& input);
-#endif

-template <class DataType, class Arg1Type, class Arg2Type,
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-          typename SizeType, class Arg3Type>
-typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                        Arg3Type>::HostMirror
-create_mirror_view(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                                        Arg3Type>& input);
-#else
-          class Arg3Type, typename SizeType>
+template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
+          typename SizeType>
 typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                        SizeType>::HostMirror
 create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                   SizeType>& input);
-#endif

 }  // namespace Kokkos

@ -481,8 +453,8 @@ namespace Impl {

 template <class GraphType>
 struct StaticCrsGraphMaximumEntry {
-  typedef typename GraphType::execution_space execution_space;
-  typedef typename GraphType::data_type value_type;
+  using execution_space = typename GraphType::execution_space;
+  using value_type      = typename GraphType::data_type;

  const typename GraphType::entries_type entries;

@ -505,22 +477,13 @@ struct StaticCrsGraphMaximumEntry {

 }  // namespace Impl

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
-          class Arg3Type>
-DataType maximum_entry(const StaticCrsGraph<DataType, Arg1Type, Arg2Type,
-                                            SizeType, Arg3Type>& graph) {
-  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>
-      GraphType;
-#else
 template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
 DataType maximum_entry(const StaticCrsGraph<DataType, Arg1Type, Arg2Type,
                                            Arg3Type, SizeType>& graph) {
-  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>
-      GraphType;
-#endif
-  typedef Impl::StaticCrsGraphMaximumEntry<GraphType> FunctorType;
+  using GraphType =
+      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;
+  using FunctorType = Impl::StaticCrsGraphMaximumEntry<GraphType>;

  DataType result = 0;
  Kokkos::parallel_reduce("Kokkos::maximum_entry", graph.entries.extent(0),
--- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@ -66,7 +66,7 @@

 namespace Kokkos {

-enum { UnorderedMapInvalidIndex = ~0u };
+enum : unsigned { UnorderedMapInvalidIndex = ~0u };

 /// \brief First element of the return value of UnorderedMap::insert().
 ///
@ -84,7 +84,7 @@ enum { UnorderedMapInvalidIndex = ~0u };

 class UnorderedMapInsertResult {
 private:
-  enum Status {
+  enum Status : uint32_t {
    SUCCESS          = 1u << 31,
    EXISTING         = 1u << 30,
    FREED_EXISTING   = 1u << 29,
@ -206,42 +206,40 @@ template <typename Key, typename Value,
              pod_equal_to<typename std::remove_const<Key>::type> >
 class UnorderedMap {
 private:
-  typedef typename ViewTraits<Key, Device, void, void>::host_mirror_space
-      host_mirror_space;
+  using host_mirror_space =
+      typename ViewTraits<Key, Device, void, void>::host_mirror_space;

 public:
  //! \name Public types and constants
  //@{

  // key_types
-  typedef Key declared_key_type;
-  typedef typename std::remove_const<declared_key_type>::type key_type;
-  typedef typename std::add_const<key_type>::type const_key_type;
+  using declared_key_type = Key;
+  using key_type          = typename std::remove_const<declared_key_type>::type;
+  using const_key_type    = typename std::add_const<key_type>::type;

  // value_types
-  typedef Value declared_value_type;
-  typedef typename std::remove_const<declared_value_type>::type value_type;
-  typedef typename std::add_const<value_type>::type const_value_type;
+  using declared_value_type = Value;
+  using value_type = typename std::remove_const<declared_value_type>::type;
+  using const_value_type = typename std::add_const<value_type>::type;

-  typedef Device device_type;
-  typedef typename Device::execution_space execution_space;
-  typedef Hasher hasher_type;
-  typedef EqualTo equal_to_type;
-  typedef uint32_t size_type;
+  using device_type     = Device;
+  using execution_space = typename Device::execution_space;
+  using hasher_type     = Hasher;
+  using equal_to_type   = EqualTo;
+  using size_type       = uint32_t;

  // map_types
-  typedef UnorderedMap<declared_key_type, declared_value_type, device_type,
-                       hasher_type, equal_to_type>
-      declared_map_type;
-  typedef UnorderedMap<key_type, value_type, device_type, hasher_type,
-                       equal_to_type>
-      insertable_map_type;
-  typedef UnorderedMap<const_key_type, value_type, device_type, hasher_type,
-                       equal_to_type>
-      modifiable_map_type;
-  typedef UnorderedMap<const_key_type, const_value_type, device_type,
-                       hasher_type, equal_to_type>
-      const_map_type;
+  using declared_map_type =
+      UnorderedMap<declared_key_type, declared_value_type, device_type,
+                   hasher_type, equal_to_type>;
+  using insertable_map_type = UnorderedMap<key_type, value_type, device_type,
+                                           hasher_type, equal_to_type>;
+  using modifiable_map_type =
+      UnorderedMap<const_key_type, value_type, device_type, hasher_type,
+                   equal_to_type>;
+  using const_map_type = UnorderedMap<const_key_type, const_value_type,
+                                      device_type, hasher_type, equal_to_type>;

  static const bool is_set = std::is_same<void, value_type>::value;
  static const bool has_const_key =
@ -254,43 +252,42 @@ class UnorderedMap {
  static const bool is_modifiable_map = has_const_key && !has_const_value;
  static const bool is_const_map      = has_const_key && has_const_value;

-  typedef UnorderedMapInsertResult insert_result;
+  using insert_result = UnorderedMapInsertResult;

-  typedef UnorderedMap<Key, Value, host_mirror_space, Hasher, EqualTo>
-      HostMirror;
+  using HostMirror =
+      UnorderedMap<Key, Value, host_mirror_space, Hasher, EqualTo>;

-  typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type;
+  using histogram_type = Impl::UnorderedMapHistogram<const_map_type>;

  //@}

 private:
-  enum { invalid_index = ~static_cast<size_type>(0) };
+  enum : size_type { invalid_index = ~static_cast<size_type>(0) };

-  typedef typename Impl::if_c<is_set, int, declared_value_type>::type
-      impl_value_type;
+  using impl_value_type =
+      typename Impl::if_c<is_set, int, declared_value_type>::type;

-  typedef typename Impl::if_c<
+  using key_type_view = typename Impl::if_c<
      is_insertable_map, View<key_type *, device_type>,
-      View<const key_type *, device_type, MemoryTraits<RandomAccess> > >::type
-      key_type_view;
+      View<const key_type *, device_type, MemoryTraits<RandomAccess> > >::type;

-  typedef typename Impl::if_c<is_insertable_map || is_modifiable_map,
-                              View<impl_value_type *, device_type>,
-                              View<const impl_value_type *, device_type,
-                                   MemoryTraits<RandomAccess> > >::type
-      value_type_view;
+  using value_type_view =
+      typename Impl::if_c<is_insertable_map || is_modifiable_map,
+                          View<impl_value_type *, device_type>,
+                          View<const impl_value_type *, device_type,
+                               MemoryTraits<RandomAccess> > >::type;

-  typedef typename Impl::if_c<
+  using size_type_view = typename Impl::if_c<
      is_insertable_map, View<size_type *, device_type>,
-      View<const size_type *, device_type, MemoryTraits<RandomAccess> > >::type
-      size_type_view;
+      View<const size_type *, device_type, MemoryTraits<RandomAccess> > >::type;

-  typedef typename Impl::if_c<is_insertable_map, Bitset<execution_space>,
-                              ConstBitset<execution_space> >::type bitset_type;
+  using bitset_type =
+      typename Impl::if_c<is_insertable_map, Bitset<execution_space>,
+                          ConstBitset<execution_space> >::type;

  enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
  enum { num_scalars = 3 };
-  typedef View<int[num_scalars], LayoutLeft, device_type> scalars_view;
+  using scalars_view = View<int[num_scalars], LayoutLeft, device_type>;

 public:
  //! \name Public member functions
@ -353,6 +350,11 @@ class UnorderedMap {
    { Kokkos::deep_copy(m_scalars, 0); }
  }

+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return (m_keys.is_allocated() && m_values.is_allocated() &&
+            m_scalars.is_allocated());
+  }
+
  /// \brief Change the capacity of the the map
  ///
  /// If there are no failed inserts the current size of the map will
@ -742,9 +744,9 @@ class UnorderedMap {

      Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);

-      typedef Kokkos::Impl::DeepCopy<typename device_type::memory_space,
-                                     typename SDevice::memory_space>
-          raw_deep_copy;
+      using raw_deep_copy =
+          Kokkos::Impl::DeepCopy<typename device_type::memory_space,
+                                 typename SDevice::memory_space>;

      raw_deep_copy(tmp.m_hash_lists.data(), src.m_hash_lists.data(),
                    sizeof(size_type) * src.m_hash_lists.extent(0));
@ -768,25 +770,25 @@ class UnorderedMap {
  bool modified() const { return get_flag(modified_idx); }

  void set_flag(int flag) const {
-    typedef Kokkos::Impl::DeepCopy<typename device_type::memory_space,
-                                   Kokkos::HostSpace>
-        raw_deep_copy;
+    using raw_deep_copy =
+        Kokkos::Impl::DeepCopy<typename device_type::memory_space,
+                               Kokkos::HostSpace>;
    const int true_ = true;
    raw_deep_copy(m_scalars.data() + flag, &true_, sizeof(int));
  }

  void reset_flag(int flag) const {
-    typedef Kokkos::Impl::DeepCopy<typename device_type::memory_space,
-                                   Kokkos::HostSpace>
-        raw_deep_copy;
+    using raw_deep_copy =
+        Kokkos::Impl::DeepCopy<typename device_type::memory_space,
+                               Kokkos::HostSpace>;
    const int false_ = false;
    raw_deep_copy(m_scalars.data() + flag, &false_, sizeof(int));
  }

  bool get_flag(int flag) const {
-    typedef Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
-                                   typename device_type::memory_space>
-        raw_deep_copy;
+    using raw_deep_copy =
+        Kokkos::Impl::DeepCopy<Kokkos::HostSpace,
+                               typename device_type::memory_space>;
    int result = false;
    raw_deep_copy(&result, m_scalars.data() + flag, sizeof(int));
    return result;
--- a/lib/kokkos/containers/src/Kokkos_Vector.hpp
+++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp
@ -58,19 +58,19 @@ namespace Kokkos {
 template <class Scalar, class Arg1Type = void>
 class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
 public:
-  typedef Scalar value_type;
-  typedef Scalar* pointer;
-  typedef const Scalar* const_pointer;
-  typedef Scalar& reference;
-  typedef const Scalar& const_reference;
-  typedef Scalar* iterator;
-  typedef const Scalar* const_iterator;
-  typedef size_t size_type;
+  using value_type      = Scalar;
+  using pointer         = Scalar*;
+  using const_pointer   = const Scalar*;
+  using reference       = Scalar&;
+  using const_reference = const Scalar&;
+  using iterator        = Scalar*;
+  using const_iterator  = const Scalar*;
+  using size_type       = size_t;

 private:
  size_t _size;
  float _extra_storage;
-  typedef DualView<Scalar*, LayoutLeft, Arg1Type> DV;
+  using DV = DualView<Scalar*, LayoutLeft, Arg1Type>;

 public:
 #ifdef KOKKOS_ENABLE_CUDA_UVM
@ -212,14 +212,17 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
    return begin() + start;
  }

+  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
+    return DV::is_allocated();
+  }
+
  size_type size() const { return _size; }
  size_type max_size() const { return 2000000000; }
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  size_type capacity() const { return DV::capacity(); }
-#endif
  size_type span() const { return DV::span(); }
  bool empty() const { return _size == 0; }

+  pointer data() const { return DV::h_view.data(); }
+
  iterator begin() const { return DV::h_view.data(); }

  iterator end() const {
@ -310,7 +313,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {

 public:
  struct set_functor {
-    typedef typename DV::t_dev::execution_space execution_space;
+    using execution_space = typename DV::t_dev::execution_space;
    typename DV::t_dev _data;
    Scalar _val;

@ -321,7 +324,7 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
  };

  struct set_functor_host {
-    typedef typename DV::t_host::execution_space execution_space;
+    using execution_space = typename DV::t_host::execution_space;
    typename DV::t_host _data;
    Scalar _val;

--- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@ -65,11 +65,11 @@ unsigned rotate_right(unsigned i, int r) {

 template <typename Bitset>
 struct BitsetCount {
-  typedef Bitset bitset_type;
-  typedef
-      typename bitset_type::execution_space::execution_space execution_space;
-  typedef typename bitset_type::size_type size_type;
-  typedef size_type value_type;
+  using bitset_type = Bitset;
+  using execution_space =
+      typename bitset_type::execution_space::execution_space;
+  using size_type  = typename bitset_type::size_type;
+  using value_type = size_type;

  bitset_type m_bitset;

--- a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@ -140,10 +140,10 @@ uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
 template <typename T>
 KOKKOS_FORCEINLINE_FUNCTION bool bitwise_equal(T const* const a_ptr,
                                               T const* const b_ptr) {
-  typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;
-  typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;
-  typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;
-  typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8;
+  typedef uint64_t KOKKOS_IMPL_MAY_ALIAS T64;  // NOLINT(modernize-use-using)
+  typedef uint32_t KOKKOS_IMPL_MAY_ALIAS T32;  // NOLINT(modernize-use-using)
+  typedef uint16_t KOKKOS_IMPL_MAY_ALIAS T16;  // NOLINT(modernize-use-using)
+  typedef uint8_t KOKKOS_IMPL_MAY_ALIAS T8;    // NOLINT(modernize-use-using)

  enum {
    NUM_8  = sizeof(T),
--- a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
@ -50,19 +50,6 @@

 namespace Kokkos {

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
-          class Arg3Type>
-inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                               Arg3Type>::HostMirror
-create_mirror_view(
-    const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>&
-        view,
-    typename std::enable_if<ViewTraits<DataType, Arg1Type, Arg2Type,
-                                       Arg3Type>::is_hostspace>::type* = 0) {
-  return view;
-}
-#else
 template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
 inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
@ -74,20 +61,7 @@ create_mirror_view(
                                       Arg3Type>::is_hostspace>::type* = 0) {
  return view;
 }
-#endif

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
-          class Arg3Type>
-inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                               Arg3Type>::HostMirror
-create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                                   Arg3Type>& view) {
-  // Force copy:
-  // typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
-  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>
-      staticcrsgraph_type;
-#else
 template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
 inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
@ -95,10 +69,9 @@ inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
 create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
                                   SizeType>& view) {
  // Force copy:
-  // typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
-  typedef StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>
-      staticcrsgraph_type;
-#endif
+  // using alloc = Impl::ViewAssignment<Impl::ViewDefault>; // unused
+  using staticcrsgraph_type =
+      StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>;

  typename staticcrsgraph_type::HostMirror tmp;
  typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map =
@ -120,17 +93,6 @@ create_mirror(const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
  return tmp;
 }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-template <class DataType, class Arg1Type, class Arg2Type, typename SizeType,
-          class Arg3Type>
-inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType,
-                               Arg3Type>::HostMirror
-create_mirror_view(
-    const StaticCrsGraph<DataType, Arg1Type, Arg2Type, SizeType, Arg3Type>&
-        view,
-    typename std::enable_if<!ViewTraits<DataType, Arg1Type, Arg2Type,
-                                        Arg3Type>::is_hostspace>::type* = 0)
-#else
 template <class DataType, class Arg1Type, class Arg2Type, class Arg3Type,
          typename SizeType>
 inline typename StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type,
@ -139,9 +101,7 @@ create_mirror_view(
    const StaticCrsGraph<DataType, Arg1Type, Arg2Type, Arg3Type, SizeType>&
        view,
    typename std::enable_if<!ViewTraits<DataType, Arg1Type, Arg2Type,
-                                        Arg3Type>::is_hostspace>::type* = 0)
-#endif
-{
+                                        Arg3Type>::is_hostspace>::type* = 0) {
  return create_mirror(view);
 }
 }  // namespace Kokkos
@ -154,16 +114,15 @@ namespace Kokkos {
 template <class StaticCrsGraphType, class InputSizeType>
 inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
    const std::string& label, const std::vector<InputSizeType>& input) {
-  typedef StaticCrsGraphType output_type;
-  // typedef std::vector< InputSizeType >  input_type ; // unused
+  using output_type = StaticCrsGraphType;
+  // using input_type = std::vector<InputSizeType>; // unused

-  typedef typename output_type::entries_type entries_type;
+  using entries_type = typename output_type::entries_type;

-  typedef View<typename output_type::size_type[],
-               typename output_type::array_layout,
-               typename output_type::execution_space,
-               typename output_type::memory_traits>
-      work_type;
+  using work_type = View<typename output_type::size_type[],
+                         typename output_type::array_layout,
+                         typename output_type::execution_space,
+                         typename output_type::memory_traits>;

  output_type output;

@ -197,16 +156,15 @@ template <class StaticCrsGraphType, class InputSizeType>
 inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph(
    const std::string& label,
    const std::vector<std::vector<InputSizeType> >& input) {
-  typedef StaticCrsGraphType output_type;
-  typedef typename output_type::entries_type entries_type;
+  using output_type  = StaticCrsGraphType;
+  using entries_type = typename output_type::entries_type;

  static_assert(entries_type::rank == 1, "Graph entries view must be rank one");

-  typedef View<typename output_type::size_type[],
-               typename output_type::array_layout,
-               typename output_type::execution_space,
-               typename output_type::memory_traits>
-      work_type;
+  using work_type = View<typename output_type::size_type[],
+                         typename output_type::array_layout,
+                         typename output_type::execution_space,
+                         typename output_type::memory_traits>;

  output_type output;

--- a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@ -60,10 +60,10 @@ uint32_t find_hash_size(uint32_t size);

 template <typename Map>
 struct UnorderedMapRehash {
-  typedef Map map_type;
-  typedef typename map_type::const_map_type const_map_type;
-  typedef typename map_type::execution_space execution_space;
-  typedef typename map_type::size_type size_type;
+  using map_type        = Map;
+  using const_map_type  = typename map_type::const_map_type;
+  using execution_space = typename map_type::execution_space;
+  using size_type       = typename map_type::size_type;

  map_type m_dst;
  const_map_type m_src;
@ -84,11 +84,11 @@ struct UnorderedMapRehash {

 template <typename UMap>
 struct UnorderedMapErase {
-  typedef UMap map_type;
-  typedef typename map_type::execution_space execution_space;
-  typedef typename map_type::size_type size_type;
-  typedef typename map_type::key_type key_type;
-  typedef typename map_type::impl_value_type value_type;
+  using map_type        = UMap;
+  using execution_space = typename map_type::execution_space;
+  using size_type       = typename map_type::size_type;
+  using key_type        = typename map_type::key_type;
+  using value_type      = typename map_type::impl_value_type;

  map_type m_map;

@ -140,12 +140,12 @@ struct UnorderedMapErase {

 template <typename UMap>
 struct UnorderedMapHistogram {
-  typedef UMap map_type;
-  typedef typename map_type::execution_space execution_space;
-  typedef typename map_type::size_type size_type;
+  using map_type        = UMap;
+  using execution_space = typename map_type::execution_space;
+  using size_type       = typename map_type::size_type;

-  typedef View<int[100], execution_space> histogram_view;
-  typedef typename histogram_view::HostMirror host_histogram_view;
+  using histogram_view      = View<int[100], execution_space>;
+  using host_histogram_view = typename histogram_view::HostMirror;

  map_type m_map;
  histogram_view m_length;
@ -230,9 +230,9 @@ struct UnorderedMapHistogram {

 template <typename UMap>
 struct UnorderedMapPrint {
-  typedef UMap map_type;
-  typedef typename map_type::execution_space execution_space;
-  typedef typename map_type::size_type size_type;
+  using map_type        = UMap;
+  using execution_space = typename map_type::execution_space;
+  using size_type       = typename map_type::size_type;

  map_type m_map;

--- a/lib/kokkos/containers/unit_tests/TestBitset.hpp
+++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp
@ -47,6 +47,7 @@
 #include <iostream>
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Bitset.hpp>
+#include <array>

 namespace Test {

@ -54,9 +55,9 @@ namespace Impl {

 template <typename Bitset, bool Set>
 struct TestBitset {
-  typedef Bitset bitset_type;
-  typedef typename bitset_type::execution_space execution_space;
-  typedef uint32_t value_type;
+  using bitset_type     = Bitset;
+  using execution_space = typename bitset_type::execution_space;
+  using value_type      = uint32_t;

  bitset_type m_bitset;

@ -95,9 +96,9 @@ struct TestBitset {

 template <typename Bitset>
 struct TestBitsetTest {
-  typedef Bitset bitset_type;
-  typedef typename bitset_type::execution_space execution_space;
-  typedef uint32_t value_type;
+  using bitset_type     = Bitset;
+  using execution_space = typename bitset_type::execution_space;
+  using value_type      = uint32_t;

  bitset_type m_bitset;

@ -127,9 +128,9 @@ struct TestBitsetTest {

 template <typename Bitset, bool Set>
 struct TestBitsetAny {
-  typedef Bitset bitset_type;
-  typedef typename bitset_type::execution_space execution_space;
-  typedef uint32_t value_type;
+  using bitset_type     = Bitset;
+  using execution_space = typename bitset_type::execution_space;
+  using value_type      = uint32_t;

  bitset_type m_bitset;

@ -181,16 +182,30 @@ struct TestBitsetAny {

 template <typename Device>
 void test_bitset() {
-  typedef Kokkos::Bitset<Device> bitset_type;
-  typedef Kokkos::ConstBitset<Device> const_bitset_type;
+  using bitset_type       = Kokkos::Bitset<Device>;
+  using const_bitset_type = Kokkos::ConstBitset<Device>;

-  // unsigned test_sizes[] = { 0u, 1000u, 1u<<14, 1u<<16, 10000001 };
-  unsigned test_sizes[] = {1000u, 1u << 14, 1u << 16, 10000001};
+  {
+    unsigned ts = 100u;
+    bitset_type b1;
+    ASSERT_TRUE(b1.is_allocated());

-  for (int i = 0, end = sizeof(test_sizes) / sizeof(unsigned); i < end; ++i) {
+    b1 = bitset_type(ts);
+    bitset_type b2(b1);
+    bitset_type b3(ts);
+
+    ASSERT_TRUE(b1.is_allocated());
+    ASSERT_TRUE(b2.is_allocated());
+    ASSERT_TRUE(b3.is_allocated());
+  }
+
+  std::array<unsigned, 7> test_sizes = {
+      {0u, 10u, 100u, 1000u, 1u << 14, 1u << 16, 10000001}};
+
+  for (const auto test_size : test_sizes) {
    // std::cout << "Bitset " << test_sizes[i] << std::endl;

-    bitset_type bitset(test_sizes[i]);
+    bitset_type bitset(test_size);

    // std::cout << "  Check initial count " << std::endl;
    // nothing should be set
@ -253,10 +268,7 @@ void test_bitset() {
  }
 }

-// FIXME_HIP deadlock
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, bitset) { test_bitset<TEST_EXECSPACE>(); }
-#endif
 }  // namespace Test

 #endif  // KOKKOS_TEST_BITSET_HPP
--- a/lib/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp
@ -55,13 +55,45 @@
 namespace Test {

 namespace Impl {
+/// Checks the allocation state reported by a DualView-like type: a
+/// default-constructed view must report "not allocated", while views built
+/// from a label and extents -- and copies of them -- must report "allocated".
+/// The outcome of the check is stored in `result`.
+template <typename Scalar, class Device>
+struct test_dualview_alloc {
+  using scalar_type     = Scalar;
+  using execution_space = Device;
+
+  /// Runs the allocation checks for ViewType with extents (n, m).
+  /// n is raised to at least 10 and m to at least 3 before use.
+  /// Returns true only if every is_allocated() query gave the expected answer.
+  template <typename ViewType>
+  bool run_me(unsigned int n, unsigned int m) {
+    if (n < 10) n = 10;
+    if (m < 3) m = 3;
+
+    // A default-constructed view must not claim to own an allocation.
+    ViewType b1;
+    if (b1.is_allocated()) return false;
+
+    b1 = ViewType("B1", n, m);  // assign a freshly constructed view
+    ViewType b2(b1);            // copy-construct from the assigned view
+    ViewType b3("B3", n, m);    // construct directly from label + extents
+
+    // Checked in the order b1, b2, b3; stops at the first failure.
+    return b1.is_allocated() && b2.is_allocated() && b3.is_allocated();
+  }
+
+  bool result = false;
+
+  // explicit: an unsigned size should never silently convert into a test
+  // object.
+  explicit test_dualview_alloc(unsigned int size) {
+    result = run_me<Kokkos::DualView<Scalar**, Kokkos::LayoutLeft, Device> >(
+        size, 3);
+  }
+};

 template <typename Scalar, class Device>
 struct test_dualview_combinations {
-  typedef test_dualview_combinations<Scalar, Device> self_type;
+  using self_type = test_dualview_combinations<Scalar, Device>;

-  typedef Scalar scalar_type;
-  typedef Device execution_space;
+  using scalar_type     = Scalar;
+  using execution_space = Device;

  Scalar reference;
  Scalar result;
@ -110,7 +142,7 @@ struct test_dualview_combinations {

 template <typename Scalar, class ViewType>
 struct SumViewEntriesFunctor {
-  typedef Scalar value_type;
+  using value_type = Scalar;

  ViewType fv;

@ -126,8 +158,8 @@ struct SumViewEntriesFunctor {

 template <typename Scalar, class Device>
 struct test_dual_view_deep_copy {
-  typedef Scalar scalar_type;
-  typedef Device execution_space;
+  using scalar_type     = Scalar;
+  using execution_space = Device;

  template <typename ViewType>
  void run_me(int n, const int m, const bool use_templ_sync) {
@ -153,8 +185,8 @@ struct test_dual_view_deep_copy {
    // Check device view is initialized as expected
    scalar_type a_d_sum = 0;
    // Execute on the execution_space associated with t_dev's memory space
-    typedef typename ViewType::t_dev::memory_space::execution_space
-        t_dev_exec_space;
+    using t_dev_exec_space =
+        typename ViewType::t_dev::memory_space::execution_space;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<t_dev_exec_space>(0, n),
        SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
@ -220,8 +252,8 @@ struct test_dual_view_deep_copy {

 template <typename Scalar, class Device>
 struct test_dualview_resize {
-  typedef Scalar scalar_type;
-  typedef Device execution_space;
+  using scalar_type     = Scalar;
+  using execution_space = Device;

  template <typename ViewType>
  void run_me() {
@ -244,8 +276,8 @@ struct test_dualview_resize {
    // Check device view is initialized as expected
    scalar_type a_d_sum = 0;
    // Execute on the execution_space associated with t_dev's memory space
-    typedef typename ViewType::t_dev::memory_space::execution_space
-        t_dev_exec_space;
+    using t_dev_exec_space =
+        typename ViewType::t_dev::memory_space::execution_space;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)),
        SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
@ -274,8 +306,8 @@ struct test_dualview_resize {
    // Check device view is initialized as expected
    a_d_sum = 0;
    // Execute on the execution_space associated with t_dev's memory space
-    typedef typename ViewType::t_dev::memory_space::execution_space
-        t_dev_exec_space;
+    using t_dev_exec_space =
+        typename ViewType::t_dev::memory_space::execution_space;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)),
        SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
@ -301,8 +333,8 @@ struct test_dualview_resize {

 template <typename Scalar, class Device>
 struct test_dualview_realloc {
-  typedef Scalar scalar_type;
-  typedef Device execution_space;
+  using scalar_type     = Scalar;
+  using execution_space = Device;

  template <typename ViewType>
  void run_me() {
@ -319,8 +351,8 @@ struct test_dualview_realloc {
    // Check device view is initialized as expected
    scalar_type a_d_sum = 0;
    // Execute on the execution_space associated with t_dev's memory space
-    typedef typename ViewType::t_dev::memory_space::execution_space
-        t_dev_exec_space;
+    using t_dev_exec_space =
+        typename ViewType::t_dev::memory_space::execution_space;
    Kokkos::parallel_reduce(
        Kokkos::RangePolicy<t_dev_exec_space>(0, a.d_view.extent(0)),
        SumViewEntriesFunctor<scalar_type, typename ViewType::t_dev>(a.d_view),
@ -351,6 +383,12 @@ void test_dualview_combinations(unsigned int size, bool with_init) {
  ASSERT_EQ(test.result, 0);
 }

+/// Constructs the Impl::test_dualview_alloc checker for the given size and
+/// asserts that it recorded success.
+template <typename Scalar, typename Device>
+void test_dualview_alloc(unsigned int size) {
+  const Impl::test_dualview_alloc<Scalar, Device> alloc_check(size);
+  ASSERT_TRUE(alloc_check.result);
+}
+
 template <typename Scalar, typename Device>
 void test_dualview_deep_copy() {
  Impl::test_dual_view_deep_copy<Scalar, Device>();
@ -370,6 +408,10 @@ TEST(TEST_CATEGORY, dualview_combination) {
  test_dualview_combinations<int, TEST_EXECSPACE>(10, true);
 }

+// Runs the DualView allocation-state check for int entries on the execution
+// space under test, with a requested size of 10.
+TEST(TEST_CATEGORY, dualview_alloc) {
+  test_dualview_alloc<int, TEST_EXECSPACE>(10);
+}
+
 TEST(TEST_CATEGORY, dualview_combinations_without_init) {
  test_dualview_combinations<int, TEST_EXECSPACE>(10, false);
 }
--- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@ -68,12 +68,12 @@ size_t allocation_count(const Kokkos::DynRankView<T, P...>& view) {

 template <typename T, class DeviceType>
 struct TestViewOperator {
-  typedef DeviceType execution_space;
+  using execution_space = DeviceType;

  static const unsigned N = 100;
  static const unsigned D = 3;

-  typedef Kokkos::DynRankView<T, execution_space> view_type;
+  using view_type = Kokkos::DynRankView<T, execution_space>;

  const view_type v1;
  const view_type v2;
@ -101,11 +101,11 @@ struct TestViewOperator_LeftAndRight;

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -116,11 +116,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

  left_view left;
  right_view right;
@ -186,11 +186,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 7> {

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -201,11 +201,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

  left_view left;
  right_view right;
@ -268,11 +268,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 6> {

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -283,14 +283,14 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>
-      stride_view;
+  using stride_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>;

  left_view left;
  right_view right;
@ -363,11 +363,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 5> {

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -378,11 +378,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

  left_view left;
  right_view right;
@ -438,11 +438,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 4> {

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -453,14 +453,14 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>
-      stride_view;
+  using stride_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>;

  left_view left;
  right_view right;
@ -536,11 +536,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 3> {

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -551,11 +551,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

  left_view left;
  right_view right;
@ -616,11 +616,11 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 2> {

 template <class DataType, class DeviceType>
 struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::memory_space memory_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using memory_space    = typename execution_space::memory_space;
+  using size_type       = typename execution_space::size_type;

-  typedef int value_type;
+  using value_type = int;

  KOKKOS_INLINE_FUNCTION
  static void join(volatile value_type& update,
@ -631,14 +631,14 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
  KOKKOS_INLINE_FUNCTION
  static void init(value_type& update) { update = 0; }

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>
-      left_view;
+  using left_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>
-      right_view;
+  using right_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutRight, execution_space>;

-  typedef Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>
-      stride_view;
+  using stride_view =
+      Kokkos::DynRankView<DataType, Kokkos::LayoutStride, execution_space>;

  left_view left;
  right_view right;
@ -689,22 +689,22 @@ struct TestViewOperator_LeftAndRight<DataType, DeviceType, 1> {
 template <typename T, class DeviceType>
 class TestDynViewAPI {
 public:
-  typedef DeviceType device;
+  using device = DeviceType;

  enum { N0 = 1000, N1 = 3, N2 = 5, N3 = 7 };

-  typedef Kokkos::DynRankView<T, device> dView0;
-  typedef Kokkos::DynRankView<const T, device> const_dView0;
+  using dView0       = Kokkos::DynRankView<T, device>;
+  using const_dView0 = Kokkos::DynRankView<const T, device>;

-  typedef Kokkos::DynRankView<T, device, Kokkos::MemoryUnmanaged>
-      dView0_unmanaged;
-  typedef typename dView0::host_mirror_space host_drv_space;
+  using dView0_unmanaged =
+      Kokkos::DynRankView<T, device, Kokkos::MemoryUnmanaged>;
+  using host_drv_space = typename dView0::host_mirror_space;

-  typedef Kokkos::View<T, device> View0;
-  typedef Kokkos::View<T*, device> View1;
-  typedef Kokkos::View<T*******, device> View7;
+  using View0 = Kokkos::View<T, device>;
+  using View1 = Kokkos::View<T*, device>;
+  using View7 = Kokkos::View<T*******, device>;

-  typedef typename View0::host_mirror_space host_view_space;
+  using host_view_space = typename View0::host_mirror_space;

  static void run_tests() {
    run_test_resize_realloc();
@ -712,6 +712,7 @@ class TestDynViewAPI {
    run_test_mirror_and_copy();
    run_test_scalar();
    run_test();
+    run_test_allocated();
    run_test_const();
    run_test_subview();
    run_test_subview_strided();
@ -750,8 +751,8 @@ class TestDynViewAPI {
  }

  static void run_test_mirror() {
-    typedef Kokkos::DynRankView<int, host_drv_space> view_type;
-    typedef typename view_type::HostMirror mirror_type;
+    using view_type   = Kokkos::DynRankView<int, host_drv_space>;
+    using mirror_type = typename view_type::HostMirror;
    view_type a("a");
    mirror_type am = Kokkos::create_mirror_view(a);
    mirror_type ax = Kokkos::create_mirror(a);
@ -851,8 +852,8 @@ class TestDynViewAPI {
      ASSERT_EQ(a_h.rank(), a_d.rank());
    }
    {
-      typedef Kokkos::DynRankView<int, Kokkos::LayoutStride, Kokkos::HostSpace>
-          view_stride_type;
+      using view_stride_type =
+          Kokkos::DynRankView<int, Kokkos::LayoutStride, Kokkos::HostSpace>;
      unsigned order[] = {6, 5, 4, 3, 2, 1, 0},
               dimen[] = {N0, N1, N2, 2, 2, 2, 2};  // LayoutRight equivalent
      view_stride_type a_h(
@ -956,8 +957,8 @@ class TestDynViewAPI {
  }

  static void run_test_scalar() {
-    typedef typename dView0::HostMirror
-        hView0;  // HostMirror of DynRankView is a DynRankView
+    using hView0 = typename dView0::HostMirror;  // HostMirror of DynRankView is
+                                                 // a DynRankView

    dView0 dx, dy;
    hView0 hx, hy;
@ -1050,12 +1051,12 @@ class TestDynViewAPI {

  static void run_test() {
    // mfh 14 Feb 2014: This test doesn't actually create instances of
-    // these types.  In order to avoid "declared but unused typedef"
+    // these types.  In order to avoid "unused type alias"
    // warnings, we declare empty instances of these types, with the
    // usual "(void)" marker to avoid compiler warnings for unused
    // variables.

-    typedef typename dView0::HostMirror hView0;
+    using hView0 = typename dView0::HostMirror;

    {
      hView0 thing;
@ -1361,7 +1362,7 @@ class TestDynViewAPI {
    }
  }

-  typedef T DataType;
+  using DataType = T;

  static void check_auto_conversion_to_const(
      const Kokkos::DynRankView<const DataType, device>& arg_const,
@ -1369,12 +1370,28 @@ class TestDynViewAPI {
    ASSERT_TRUE(arg_const == arg);
  }

+  /// Verifies DynRankView::is_allocated(): false for a default-constructed
+  /// view, true after assigning or constructing from a label and extents,
+  /// and true for a copy of an allocated view.
+  static void run_test_allocated() {
+    using device_type = Kokkos::DynRankView<DataType, device>;
+
+    // Named so they do not shadow the class-scope enumerators N1 and N2,
+    // which hold different values.
+    const int extent0 = 100;
+    const int extent1 = 10;
+
+    device_type d1;
+    ASSERT_FALSE(d1.is_allocated());
+
+    d1 = device_type("d1", extent0, extent1);  // built with two extents
+    device_type d2(d1);                        // copy of d1
+    device_type d3("d3", extent0);             // built with one extent
+    ASSERT_TRUE(d1.is_allocated());
+    ASSERT_TRUE(d2.is_allocated());
+    ASSERT_TRUE(d3.is_allocated());
+  }
+
  static void run_test_const() {
-    typedef Kokkos::DynRankView<DataType, device> typeX;
-    typedef Kokkos::DynRankView<const DataType, device> const_typeX;
-    typedef Kokkos::DynRankView<const DataType, device,
-                                Kokkos::MemoryRandomAccess>
-        const_typeR;
+    using typeX       = Kokkos::DynRankView<DataType, device>;
+    using const_typeX = Kokkos::DynRankView<const DataType, device>;
+    using const_typeR =
+        Kokkos::DynRankView<const DataType, device, Kokkos::MemoryRandomAccess>;
    typeX x("X", 2);
    const_typeX xc = x;
    const_typeR xr = x;
@ -1398,10 +1415,10 @@ class TestDynViewAPI {
  }

  static void run_test_subview() {
-    typedef Kokkos::DynRankView<const T, device> cdView;
-    typedef Kokkos::DynRankView<T, device> dView;
+    using cdView = Kokkos::DynRankView<const T, device>;
+    using dView  = Kokkos::DynRankView<T, device>;
    // LayoutStride required for all returned DynRankView subdynrankview's
-    typedef Kokkos::DynRankView<T, Kokkos::LayoutStride, device> sdView;
+    using sdView = Kokkos::DynRankView<T, Kokkos::LayoutStride, device>;

    dView0 d0("d0");
    cdView s0 = d0;
@ -1452,7 +1469,7 @@ class TestDynViewAPI {
    ASSERT_EQ(dv6.rank(), 6);

    // DynRankView with LayoutRight
-    typedef Kokkos::DynRankView<T, Kokkos::LayoutRight, device> drView;
+    using drView = Kokkos::DynRankView<T, Kokkos::LayoutRight, device>;
    drView dr5("dr5", N0, N1, N2, 2, 2);
    ASSERT_EQ(dr5.rank(), 5);

@ -1514,7 +1531,8 @@ class TestDynViewAPI {
    ASSERT_EQ(ds5.extent(4), ds5plus.extent(4));
    ASSERT_EQ(ds5.extent(5), ds5plus.extent(5));

-#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)
+#if (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)) && \
+    !defined(KOKKOS_ENABLE_HIP)
    ASSERT_EQ(&ds5(1, 1, 1, 1, 0) - &ds5plus(1, 1, 1, 1, 0), 0);
    ASSERT_EQ(&ds5(1, 1, 1, 1, 0, 0) - &ds5plus(1, 1, 1, 1, 0, 0),
              0);  // passing argument to rank beyond the view's rank is allowed
@ -1538,12 +1556,12 @@ class TestDynViewAPI {
  }

  static void run_test_subview_strided() {
-    typedef Kokkos::DynRankView<int, Kokkos::LayoutLeft, host_drv_space>
-        drview_left;
-    typedef Kokkos::DynRankView<int, Kokkos::LayoutRight, host_drv_space>
-        drview_right;
-    typedef Kokkos::DynRankView<int, Kokkos::LayoutStride, host_drv_space>
-        drview_stride;
+    using drview_left =
+        Kokkos::DynRankView<int, Kokkos::LayoutLeft, host_drv_space>;
+    using drview_right =
+        Kokkos::DynRankView<int, Kokkos::LayoutRight, host_drv_space>;
+    using drview_stride =
+        Kokkos::DynRankView<int, Kokkos::LayoutStride, host_drv_space>;

    drview_left xl2("xl2", 100, 200);
    drview_right xr2("xr2", 100, 200);
@ -1588,31 +1606,29 @@ class TestDynViewAPI {
  static void run_test_vector() {
    static const unsigned Length = 1000, Count = 8;

-    typedef typename Kokkos::DynRankView<T, Kokkos::LayoutLeft, host_drv_space>
-        multivector_type;
+    using multivector_type =
+        typename Kokkos::DynRankView<T, Kokkos::LayoutLeft, host_drv_space>;

-    typedef typename Kokkos::DynRankView<T, Kokkos::LayoutRight, host_drv_space>
-        multivector_right_type;
+    using multivector_right_type =
+        typename Kokkos::DynRankView<T, Kokkos::LayoutRight, host_drv_space>;

    multivector_type mv = multivector_type("mv", Length, Count);
    multivector_right_type mv_right =
        multivector_right_type("mv", Length, Count);

-    typedef
-        typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>
-            svector_type;
-    typedef
-        typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>
-            smultivector_type;
-    typedef typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
-                                         host_drv_space>
-        const_svector_right_type;
-    typedef typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
-                                         host_drv_space>
-        const_svector_type;
-    typedef typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
-                                         host_drv_space>
-        const_smultivector_type;
+    using svector_type =
+        typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>;
+    using smultivector_type =
+        typename Kokkos::DynRankView<T, Kokkos::LayoutStride, host_drv_space>;
+    using const_svector_right_type =
+        typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
+                                     host_drv_space>;
+    using const_svector_type =
+        typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
+                                     host_drv_space>;
+    using const_smultivector_type =
+        typename Kokkos::DynRankView<const T, Kokkos::LayoutStride,
+                                     host_drv_space>;

    svector_type v1 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 0);
    svector_type v2 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 1);
--- a/lib/kokkos/containers/unit_tests/TestDynViewAPI_generic.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI_generic.hpp
@ -44,10 +44,7 @@

 #include <TestDynViewAPI.hpp>
 namespace Test {
-// FIXME_HIP attempt to access inaccessible memory space
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, dyn_rank_view_api_generic) {
  TestDynViewAPI<double, TEST_EXECSPACE>::run_tests();
 }
-#endif
 }  // namespace Test
--- a/lib/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI_rank12345.hpp
@ -45,10 +45,7 @@
 #include <TestDynViewAPI.hpp>

 namespace Test {
-// FIXME_HIP failing with wrong value
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, dyn_rank_view_api_operator_rank12345) {
  TestDynViewAPI<double, TEST_EXECSPACE>::run_operator_test_rank12345();
 }
-#endif
 }  // namespace Test
--- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@ -58,12 +58,12 @@ namespace Test {

 template <typename Scalar, class Space>
 struct TestDynamicView {
-  typedef typename Space::execution_space execution_space;
-  typedef typename Space::memory_space memory_space;
+  using execution_space = typename Space::execution_space;
+  using memory_space    = typename Space::memory_space;

-  typedef Kokkos::Experimental::DynamicView<Scalar*, Space> view_type;
+  using view_type = Kokkos::Experimental::DynamicView<Scalar*, Space>;

-  typedef double value_type;
+  using value_type = double;

  static void run(unsigned arg_total_size) {
    // Test: Create DynamicView, initialize size (via resize), run through
@ -71,6 +71,27 @@ struct TestDynamicView {
    // values and repeat
    //   Case 1: min_chunk_size is a power of 2
    {
+      {
+        view_type d1;
+        ASSERT_FALSE(d1.is_allocated());
+
+        d1 = view_type("d1", 1024, arg_total_size);
+        view_type d2(d1);
+        view_type d3("d3", 1024, arg_total_size);
+
+        ASSERT_FALSE(d1.is_allocated());
+        ASSERT_FALSE(d2.is_allocated());
+        ASSERT_FALSE(d3.is_allocated());
+
+        unsigned d_size = arg_total_size / 8;
+        d1.resize_serial(d_size);
+        d2.resize_serial(d_size);
+        d3.resize_serial(d_size);
+
+        ASSERT_TRUE(d1.is_allocated());
+        ASSERT_TRUE(d2.is_allocated());
+        ASSERT_TRUE(d3.is_allocated());
+      }
      view_type da("da", 1024, arg_total_size);
      ASSERT_EQ(da.size(), 0);
      // Init
@ -223,7 +244,7 @@ struct TestDynamicView {
 };

 TEST(TEST_CATEGORY, dynamic_view) {
-  typedef TestDynamicView<double, TEST_EXECSPACE> TestDynView;
+  using TestDynView = TestDynamicView<double, TEST_EXECSPACE>;

  for (int i = 0; i < 10; ++i) {
    TestDynView::run(100000 + 100 * i);
--- a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp
+++ b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp
@ -84,9 +84,9 @@ void checkReportersAndReportsAgree(const std::vector<int> &reporters,

 template <typename DeviceType>
 struct ErrorReporterDriverBase {
-  typedef ThreeValReport<int, int, double> report_type;
-  typedef Kokkos::Experimental::ErrorReporter<report_type, DeviceType>
-      error_reporter_type;
+  using report_type = ThreeValReport<int, int, double>;
+  using error_reporter_type =
+      Kokkos::Experimental::ErrorReporter<report_type, DeviceType>;
  error_reporter_type m_errorReporter;

  ErrorReporterDriverBase(int reporter_capacity, int /*test_size*/)
@ -97,10 +97,11 @@ struct ErrorReporterDriverBase {
  }

  void check_expectations(int reporter_capacity, int test_size) {
+    using namespace std;
    int num_reported = m_errorReporter.getNumReports();
    int num_attempts = m_errorReporter.getNumReportAttempts();

-    int expected_num_reports = std::min(reporter_capacity, test_size / 2);
+    int expected_num_reports = min(reporter_capacity, test_size / 2);
    EXPECT_EQ(expected_num_reports, num_reported);
    EXPECT_EQ(test_size / 2, num_attempts);

@ -112,7 +113,7 @@ struct ErrorReporterDriverBase {

 template <typename ErrorReporterDriverType>
 void TestErrorReporter() {
-  typedef ErrorReporterDriverType tester_type;
+  using tester_type = ErrorReporterDriverType;
  std::vector<int> reporters;
  std::vector<typename tester_type::report_type> reports;

@ -147,9 +148,9 @@ void TestErrorReporter() {

 template <typename DeviceType>
 struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType> {
-  typedef ErrorReporterDriverBase<DeviceType> driver_base;
-  typedef typename driver_base::error_reporter_type::execution_space
-      execution_space;
+  using driver_base = ErrorReporterDriverBase<DeviceType>;
+  using execution_space =
+      typename driver_base::error_reporter_type::execution_space;

  ErrorReporterDriver(int reporter_capacity, int test_size)
      : driver_base(reporter_capacity, test_size) {
@ -185,12 +186,16 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase<DeviceType> {
 template <typename DeviceType>
 struct ErrorReporterDriverUseLambda
    : public ErrorReporterDriverBase<DeviceType> {
-  typedef ErrorReporterDriverBase<DeviceType> driver_base;
-  typedef typename driver_base::error_reporter_type::execution_space
-      execution_space;
+  using driver_base = ErrorReporterDriverBase<DeviceType>;
+  using execution_space =
+      typename driver_base::error_reporter_type::execution_space;

  ErrorReporterDriverUseLambda(int reporter_capacity, int test_size)
      : driver_base(reporter_capacity, test_size) {
+    execute(reporter_capacity, test_size);
+  }
+
+  void execute(int reporter_capacity, int test_size) {
    Kokkos::parallel_for(
        Kokkos::RangePolicy<execution_space>(0, test_size),
        KOKKOS_CLASS_LAMBDA(const int work_idx) {
@ -210,9 +215,9 @@ struct ErrorReporterDriverUseLambda
 #ifdef KOKKOS_ENABLE_OPENMP
 struct ErrorReporterDriverNativeOpenMP
    : public ErrorReporterDriverBase<Kokkos::OpenMP> {
-  typedef ErrorReporterDriverBase<Kokkos::OpenMP> driver_base;
-  typedef typename driver_base::error_reporter_type::execution_space
-      execution_space;
+  using driver_base = ErrorReporterDriverBase<Kokkos::OpenMP>;
+  using execution_space =
+      typename driver_base::error_reporter_type::execution_space;

  ErrorReporterDriverNativeOpenMP(int reporter_capacity, int test_size)
      : driver_base(reporter_capacity, test_size) {
--- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp
@ -61,12 +61,25 @@ namespace Test {

 template <typename Scalar, typename Device>
 void test_offsetview_construction() {
-  typedef Kokkos::Experimental::OffsetView<Scalar**, Device> offset_view_type;
-  typedef Kokkos::View<Scalar**, Device> view_type;
+  using offset_view_type = Kokkos::Experimental::OffsetView<Scalar**, Device>;
+  using view_type        = Kokkos::View<Scalar**, Device>;

  Kokkos::Experimental::index_list_type range0 = {-1, 3};
  Kokkos::Experimental::index_list_type range1 = {-2, 2};

+  {
+    offset_view_type o1;
+    ASSERT_FALSE(o1.is_allocated());
+
+    o1 = offset_view_type("o1", range0, range1);
+    offset_view_type o2(o1);
+    offset_view_type o3("o3", range0, range1);
+
+    ASSERT_TRUE(o1.is_allocated());
+    ASSERT_TRUE(o2.is_allocated());
+    ASSERT_TRUE(o3.is_allocated());
+  }
+
  offset_view_type ov("firstOV", range0, range1);

  ASSERT_EQ("firstOV", ov.label());
@ -109,9 +122,9 @@ void test_offsetview_construction() {
  {  // test deep copy of scalar const value into mirro
    const int constVal = 6;
    typename offset_view_type::HostMirror hostOffsetView =
-        Kokkos::Experimental::create_mirror_view(ov);
+        Kokkos::create_mirror_view(ov);

-    Kokkos::Experimental::deep_copy(hostOffsetView, constVal);
+    Kokkos::deep_copy(hostOffsetView, constVal);

    for (int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
      for (int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
@ -121,10 +134,9 @@ void test_offsetview_construction() {
    }
  }

-  typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
-                                Kokkos::IndexType<int> >
-      range_type;
-  typedef typename range_type::point_type point_type;
+  using range_type =
+      Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> >;
+  using point_type = typename range_type::point_type;

  range_type rangePolicy2D(point_type{{ovmin0, ovmin1}},
                           point_type{{ovend0, ovend1}});
@ -136,9 +148,9 @@ void test_offsetview_construction() {

  // test offsetview to offsetviewmirror deep copy
  typename offset_view_type::HostMirror hostOffsetView =
-      Kokkos::Experimental::create_mirror_view(ov);
+      Kokkos::create_mirror_view(ov);

-  Kokkos::Experimental::deep_copy(hostOffsetView, ov);
+  Kokkos::deep_copy(hostOffsetView, ov);

  for (int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
    for (int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
@ -185,10 +197,9 @@ void test_offsetview_construction() {

    Kokkos::deep_copy(view3D, 1);

-    typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>,
-                                  Kokkos::IndexType<int64_t> >
-        range3_type;
-    typedef typename range3_type::point_type point3_type;
+    using range3_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>,
+                                              Kokkos::IndexType<int64_t> >;
+    using point3_type = typename range3_type::point_type;

    typename point3_type::value_type begins0 = -10, begins1 = -20,
                                     begins2 = -30;
@ -245,7 +256,7 @@ void test_offsetview_construction() {

  {  // test offsetview to view deep copy
    view_type aView("aView", ov.extent(0), ov.extent(1));
-    Kokkos::Experimental::deep_copy(aView, ov);
+    Kokkos::deep_copy(aView, ov);

 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
    int sum = 0;
@ -264,7 +275,7 @@ void test_offsetview_construction() {
    view_type aView("aView", ov.extent(0), ov.extent(1));

    Kokkos::deep_copy(aView, 99);
-    Kokkos::Experimental::deep_copy(ov, aView);
+    Kokkos::deep_copy(ov, aView);

 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
    int sum = 0;
@ -447,10 +458,9 @@ void test_offsetview_subview() {
      ASSERT_EQ(offsetSubview.end(1), 9);

 #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
-      typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
-                                    Kokkos::IndexType<int> >
-          range_type;
-      typedef typename range_type::point_type point_type;
+      using range_type = Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>,
+                                               Kokkos::IndexType<int> >;
+      using point_type = typename range_type::point_type;

      const int b0 = offsetSubview.begin(0);
      const int b1 = offsetSubview.begin(1);
--- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp
@ -50,21 +50,22 @@

 namespace Test {

-template <typename DeviceType, typename Layout, int duplication,
-          int contribution, int op>
+template <typename DeviceType, typename Layout, typename Duplication,
+          typename Contribution, typename Op, typename NumberType>
 struct test_scatter_view_impl_cls;

-template <typename DeviceType, typename Layout, int duplication,
-          int contribution>
-struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
-                                  Kokkos::Experimental::ScatterSum> {
+template <typename DeviceType, typename Layout, typename Duplication,
+          typename Contribution, typename NumberType>
+struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                  Kokkos::Experimental::ScatterSum,
+                                  NumberType> {
 public:
-  typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
-                                            Kokkos::Experimental::ScatterSum,
-                                            duplication, contribution>
-      scatter_view_type;
+  using scatter_view_type =
+      Kokkos::Experimental::ScatterView<NumberType * [12], Layout, DeviceType,
+                                        Kokkos::Experimental::ScatterSum,
+                                        Duplication, Contribution>;

-  typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
+  using orig_view_type = Kokkos::View<NumberType * [12], Layout, DeviceType>;

  scatter_view_type scatter_view;
  int scatterSize;
@ -80,9 +81,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
    Kokkos::fence();
    for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0);
         ++i) {
-      host_view(i, 0) = 0.0;
-      host_view(i, 1) = 0.0;
-      host_view(i, 2) = 0.0;
+      host_view(i, 0)  = 0.0;
+      host_view(i, 1)  = 0.0;
+      host_view(i, 2)  = 0.0;
+      host_view(i, 3)  = 0.0;
+      host_view(i, 4)  = 0.0;
+      host_view(i, 5)  = 0.0;
+      host_view(i, 6)  = 0.0;
+      host_view(i, 7)  = 0.0;
+      host_view(i, 8)  = 0.0;
+      host_view(i, 9)  = 0.0;
+      host_view(i, 10) = 0.0;
+      host_view(i, 11) = 0.0;
    }
    Kokkos::fence();
    Kokkos::deep_copy(orig, host_view);
@ -102,9 +112,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
        scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
    for (int j = 0; j < 10; ++j) {
      auto k = (i + j) % scatterSize;
-      scatter_access(k, 0) += 4.2;
-      scatter_access_atomic(k, 1) += 2.0;
-      scatter_access(k, 2) += 1.0;
+      scatter_access(k, 0) += 4;
+      ++scatter_access(k, 1);
+      --scatter_access(k, 2);
+      scatter_access(k, 3)++;
+      scatter_access(k, 4)--;
+      scatter_access(k, 5) -= 5;
+      scatter_access_atomic(k, 6) += 2;
+      scatter_access_atomic(k, 7)++;
+      scatter_access_atomic(k, 8)--;
+      --scatter_access_atomic(k, 9);
+      ++scatter_access_atomic(k, 10);
+      scatter_access(k, 11) -= 3;
    }
  }

@ -114,27 +133,46 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
    Kokkos::fence();
    for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0);
         ++i) {
-      auto val0 = host_view(i, 0);
-      auto val1 = host_view(i, 1);
-      auto val2 = host_view(i, 2);
-      EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-14);
-      EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-14);
-      EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-14);
+      auto val0  = host_view(i, 0);
+      auto val1  = host_view(i, 1);
+      auto val2  = host_view(i, 2);
+      auto val3  = host_view(i, 3);
+      auto val4  = host_view(i, 4);
+      auto val5  = host_view(i, 5);
+      auto val6  = host_view(i, 6);
+      auto val7  = host_view(i, 7);
+      auto val8  = host_view(i, 8);
+      auto val9  = host_view(i, 9);
+      auto val10 = host_view(i, 10);
+      auto val11 = host_view(i, 11);
+      EXPECT_NEAR(val0, NumberType(80), 1e-14);
+      EXPECT_NEAR(val1, NumberType(20), 1e-14);
+      EXPECT_NEAR(val2, NumberType(-20), 1e-14);
+      EXPECT_NEAR(val3, NumberType(20), 1e-14);
+      EXPECT_NEAR(val4, NumberType(-20), 1e-14);
+      EXPECT_NEAR(val5, NumberType(-100), 1e-14);
+      EXPECT_NEAR(val6, NumberType(40), 1e-14);
+      EXPECT_NEAR(val7, NumberType(20), 1e-14);
+      EXPECT_NEAR(val8, NumberType(-20), 1e-14);
+      EXPECT_NEAR(val9, NumberType(-20), 1e-14);
+      EXPECT_NEAR(val10, NumberType(20), 1e-14);
+      EXPECT_NEAR(val11, NumberType(-60), 1e-14);
    }
  }
 };

-template <typename DeviceType, typename Layout, int duplication,
-          int contribution>
-struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
-                                  Kokkos::Experimental::ScatterProd> {
+template <typename DeviceType, typename Layout, typename Duplication,
+          typename Contribution, typename NumberType>
+struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                  Kokkos::Experimental::ScatterProd,
+                                  NumberType> {
 public:
-  typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
-                                            Kokkos::Experimental::ScatterProd,
-                                            duplication, contribution>
-      scatter_view_type;
+  using scatter_view_type =
+      Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType,
+                                        Kokkos::Experimental::ScatterProd,
+                                        Duplication, Contribution>;

-  typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
+  using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>;

  scatter_view_type scatter_view;
  int scatterSize;
@ -194,17 +232,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
  }
 };

-template <typename DeviceType, typename Layout, int duplication,
-          int contribution>
-struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
-                                  Kokkos::Experimental::ScatterMin> {
+template <typename DeviceType, typename Layout, typename Duplication,
+          typename Contribution, typename NumberType>
+struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                  Kokkos::Experimental::ScatterMin,
+                                  NumberType> {
 public:
-  typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
-                                            Kokkos::Experimental::ScatterMin,
-                                            duplication, contribution>
-      scatter_view_type;
+  using scatter_view_type =
+      Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType,
+                                        Kokkos::Experimental::ScatterMin,
+                                        Duplication, Contribution>;

-  typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
+  using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>;

  scatter_view_type scatter_view;
  int scatterSize;
@ -242,9 +281,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
        scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
    for (int j = 0; j < 4; ++j) {
      auto k = (i + j) % scatterSize;
-      scatter_access(k, 0).update((double)(j + 1) * 4);
-      scatter_access_atomic(k, 1).update((double)(j + 1) * 2.0);
-      scatter_access(k, 2).update((double)(j + 1) * 1.0);
+      scatter_access(k, 0).update((NumberType)(j + 1) * 4);
+      scatter_access_atomic(k, 1).update((NumberType)(j + 1) * 2.0);
+      scatter_access(k, 2).update((NumberType)(j + 1) * 1.0);
    }
  }

@ -264,17 +303,18 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
  }
 };

-template <typename DeviceType, typename Layout, int duplication,
-          int contribution>
-struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
-                                  Kokkos::Experimental::ScatterMax> {
+template <typename DeviceType, typename Layout, typename Duplication,
+          typename Contribution, typename NumberType>
+struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                  Kokkos::Experimental::ScatterMax,
+                                  NumberType> {
 public:
-  typedef Kokkos::Experimental::ScatterView<double * [3], Layout, DeviceType,
-                                            Kokkos::Experimental::ScatterMax,
-                                            duplication, contribution>
-      scatter_view_type;
+  using scatter_view_type =
+      Kokkos::Experimental::ScatterView<NumberType * [3], Layout, DeviceType,
+                                        Kokkos::Experimental::ScatterMax,
+                                        Duplication, Contribution>;

-  typedef Kokkos::View<double * [3], Layout, DeviceType> orig_view_type;
+  using orig_view_type = Kokkos::View<NumberType * [3], Layout, DeviceType>;

  scatter_view_type scatter_view;
  int scatterSize;
@ -311,9 +351,9 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
        scatter_view.template access<Kokkos::Experimental::ScatterAtomic>();
    for (int j = 0; j < 4; ++j) {
      auto k = (i + j) % scatterSize;
-      scatter_access(k, 0).update((double)(j + 1) * 4);
-      scatter_access_atomic(k, 1).update((double)(j + 1) * 2.0);
-      scatter_access(k, 2).update((double)(j + 1) * 1.0);
+      scatter_access(k, 0).update((NumberType)(j + 1) * 4);
+      scatter_access_atomic(k, 1).update((NumberType)(j + 1) * 2.0);
+      scatter_access(k, 2).update((NumberType)(j + 1) * 1.0);
    }
  }

@ -333,27 +373,126 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
  }
 };

-template <typename DeviceType, typename Layout, int duplication,
-          int contribution, int op>
-struct test_scatter_view_config {
+template <typename DeviceType, typename Layout, typename Op,
+          typename NumberType>
+struct test_default_scatter_view {
 public:
-  typedef
-      typename test_scatter_view_impl_cls<DeviceType, Layout, duplication,
-                                          contribution, op>::scatter_view_type
-          scatter_view_def;
-  typedef typename test_scatter_view_impl_cls<DeviceType, Layout, duplication,
-                                              contribution, op>::orig_view_type
-      orig_view_def;
+  using default_duplication = Kokkos::Impl::Experimental::DefaultDuplication<
+      typename DeviceType::execution_space>;
+  using Duplication  = typename default_duplication::type;
+  using Contribution = typename Kokkos::Impl::Experimental::DefaultContribution<
+      typename DeviceType::execution_space, Duplication>::type;
+  using scatter_view_def =
+      typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
+                                          Contribution, Op,
+                                          NumberType>::scatter_view_type;
+  using orig_view_def =
+      typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
+                                          Contribution, Op,
+                                          NumberType>::orig_view_type;

  void run_test(int n) {
+    // Test creation via create_scatter_view overload 1
+    {
+      orig_view_def original_view("original_view", n);
+      scatter_view_def scatter_view =
+          Kokkos::Experimental::create_scatter_view(Op{}, original_view);
+
+      test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                 Op, NumberType>
+          scatter_view_test_impl(scatter_view);
+      scatter_view_test_impl.initialize(original_view);
+      scatter_view_test_impl.run_parallel(n);
+
+      Kokkos::Experimental::contribute(original_view, scatter_view);
+      scatter_view.reset_except(original_view);
+
+      scatter_view_test_impl.run_parallel(n);
+
+      Kokkos::Experimental::contribute(original_view, scatter_view);
+      Kokkos::fence();
+
+      scatter_view_test_impl.validateResults(original_view);
+
+      {
+        scatter_view_def persistent_view("persistent", n);
+        auto result_view = persistent_view.subview();
+        contribute(result_view, persistent_view);
+        Kokkos::fence();
+      }
+    }
+  }
+};
+
+template <typename DeviceType, typename Layout, typename Duplication,
+          typename Contribution, typename Op, typename NumberType>
+struct test_scatter_view_config {
+ public:
+  using scatter_view_def =
+      typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
+                                          Contribution, Op,
+                                          NumberType>::scatter_view_type;
+  using orig_view_def =
+      typename test_scatter_view_impl_cls<DeviceType, Layout, Duplication,
+                                          Contribution, Op,
+                                          NumberType>::orig_view_type;
+
+  void run_test(int n) {
+    // test allocation
+    {
+      orig_view_def ov1("ov1", n);
+      scatter_view_def sv1;
+
+      ASSERT_FALSE(sv1.is_allocated());
+
+      sv1 = Kokkos::Experimental::create_scatter_view<Op, Duplication,
+                                                      Contribution>(ov1);
+
+      scatter_view_def sv2(sv1);
+      scatter_view_def sv3("sv3", n);
+
+      ASSERT_TRUE(sv1.is_allocated());
+      ASSERT_TRUE(sv2.is_allocated());
+      ASSERT_TRUE(sv3.is_allocated());
+    }
+
    // Test creation via create_scatter_view
    {
      orig_view_def original_view("original_view", n);
      scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view<
-          op, duplication, contribution>(original_view);
+          Op, Duplication, Contribution>(original_view);

-      test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
-                                 op>
+      test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                 Op, NumberType>
+          scatter_view_test_impl(scatter_view);
+      scatter_view_test_impl.initialize(original_view);
+      scatter_view_test_impl.run_parallel(n);
+
+      Kokkos::Experimental::contribute(original_view, scatter_view);
+      scatter_view.reset_except(original_view);
+
+      scatter_view_test_impl.run_parallel(n);
+
+      Kokkos::Experimental::contribute(original_view, scatter_view);
+      Kokkos::fence();
+
+      scatter_view_test_impl.validateResults(original_view);
+
+      {
+        scatter_view_def persistent_view("persistent", n);
+        auto result_view = persistent_view.subview();
+        contribute(result_view, persistent_view);
+        Kokkos::fence();
+      }
+    }
+    // Test creation via create_scatter_view overload 2
+    {
+      orig_view_def original_view("original_view", n);
+      scatter_view_def scatter_view = Kokkos::Experimental::create_scatter_view(
+          Op{}, Duplication{}, Contribution{}, original_view);
+
+      test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                 Op, NumberType>
          scatter_view_test_impl(scatter_view);
      scatter_view_test_impl.initialize(original_view);
      scatter_view_test_impl.run_parallel(n);
@ -380,8 +519,8 @@ struct test_scatter_view_config {
      orig_view_def original_view("original_view", n);
      scatter_view_def scatter_view(original_view);

-      test_scatter_view_impl_cls<DeviceType, Layout, duplication, contribution,
-                                 op>
+      test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
+                                 Op, NumberType>
          scatter_view_test_impl(scatter_view);
      scatter_view_test_impl.initialize(original_view);
      scatter_view_test_impl.run_parallel(n);
@ -406,19 +545,19 @@ struct test_scatter_view_config {
  }
 };

-template <typename DeviceType, int ScatterType>
+template <typename DeviceType, typename ScatterType, typename NumberType>
 struct TestDuplicatedScatterView {
  TestDuplicatedScatterView(int n) {
    // ScatterSum test
    test_scatter_view_config<DeviceType, Kokkos::LayoutRight,
                             Kokkos::Experimental::ScatterDuplicated,
                             Kokkos::Experimental::ScatterNonAtomic,
-                             ScatterType>
+                             ScatterType, NumberType>
        test_sv_right_config;
    test_sv_right_config.run_test(n);
    test_scatter_view_config<
        DeviceType, Kokkos::LayoutLeft, Kokkos::Experimental::ScatterDuplicated,
-        Kokkos::Experimental::ScatterNonAtomic, ScatterType>
+        Kokkos::Experimental::ScatterNonAtomic, ScatterType, NumberType>
        test_sv_left_config;
    test_sv_left_config.run_test(n);
  }
@ -427,18 +566,19 @@ struct TestDuplicatedScatterView {
 #ifdef KOKKOS_ENABLE_CUDA
 // disable duplicated instantiation with CUDA until
 // UniqueToken can support it
-template <int ScatterType>
-struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType> {
+template <typename ScatterType, typename NumberType>
+struct TestDuplicatedScatterView<Kokkos::Cuda, ScatterType, NumberType> {
  TestDuplicatedScatterView(int) {}
 };
-template <int ScatterType>
+template <typename ScatterType, typename NumberType>
 struct TestDuplicatedScatterView<
-    Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, ScatterType> {
+    Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>, ScatterType, NumberType> {
  TestDuplicatedScatterView(int) {}
 };
-template <int ScatterType>
+template <typename ScatterType, typename NumberType>
 struct TestDuplicatedScatterView<
-    Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>, ScatterType> {
+    Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>, ScatterType,
+    NumberType> {
  TestDuplicatedScatterView(int) {}
 };
 #endif
@ -446,13 +586,14 @@ struct TestDuplicatedScatterView<
 #ifdef KOKKOS_ENABLE_ROCM
 // disable duplicated instantiation with ROCm until
 // UniqueToken can support it
-template <int ScatterType>
-struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm, ScatterType> {
+template <typename ScatterType, typename NumberType>  // as in primary template
+struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm, ScatterType, NumberType> {
   TestDuplicatedScatterView(int) {}
 };
 #endif

-template <typename DeviceType, int ScatterType>
+template <typename DeviceType, typename ScatterType,
+          typename NumberType = double>
 void test_scatter_view(int n) {
  using execution_space = typename DeviceType::execution_space;

@ -463,7 +604,7 @@ void test_scatter_view(int n) {
    test_scatter_view_config<DeviceType, Kokkos::LayoutRight,
                             Kokkos::Experimental::ScatterNonDuplicated,
                             Kokkos::Experimental::ScatterNonAtomic,
-                             ScatterType>
+                             ScatterType, NumberType>
        test_sv_config;
    test_sv_config.run_test(n);
  }
@ -472,30 +613,40 @@ void test_scatter_view(int n) {
 #endif
    test_scatter_view_config<DeviceType, Kokkos::LayoutRight,
                             Kokkos::Experimental::ScatterNonDuplicated,
-                             Kokkos::Experimental::ScatterAtomic, ScatterType>
+                             Kokkos::Experimental::ScatterAtomic, ScatterType,
+                             NumberType>
        test_sv_config;
    test_sv_config.run_test(n);
 #ifdef KOKKOS_ENABLE_SERIAL
  }
 #endif
  // with hundreds of threads we were running out of memory.
-  // limit (n) so that duplication doesn't exceed 8GB
+  // limit (n) so that duplication doesn't exceed 4GB
  constexpr std::size_t maximum_allowed_total_bytes =
-      8ull * 1024ull * 1024ull * 1024ull;
+      4ull * 1024ull * 1024ull * 1024ull;
  std::size_t const maximum_allowed_copy_bytes =
      maximum_allowed_total_bytes /
      std::size_t(execution_space().concurrency());
-  constexpr std::size_t bytes_per_value = sizeof(double) * 3;
+  constexpr std::size_t bytes_per_value = sizeof(NumberType) * 12;
  std::size_t const maximum_allowed_copy_values =
      maximum_allowed_copy_bytes / bytes_per_value;
  n = std::min(n, int(maximum_allowed_copy_values));
-  TestDuplicatedScatterView<DeviceType, ScatterType> duptest(n);
+
+  // if the default is duplicated, this needs to follow the limit
+  {
+    test_default_scatter_view<DeviceType, Kokkos::LayoutRight, ScatterType,
+                              NumberType>
+        test_default_sv;
+    test_default_sv.run_test(n);
+  }
+  TestDuplicatedScatterView<DeviceType, ScatterType, NumberType> duptest(n);
 }

-// FIXME_HIP ScatterView requires UniqueToken
-#ifndef KOKKOS_ENABLE_HIP
 TEST(TEST_CATEGORY, scatterview) {
-  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(10);
+  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, double>(
+      10);
+  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum,
+                    unsigned int>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(10);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(10);
@ -512,7 +663,10 @@ TEST(TEST_CATEGORY, scatterview) {
 #endif

 #endif
-  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum>(big_n);
+  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum, double>(
+      big_n);
+  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterSum,
+                    unsigned int>(big_n);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterProd>(big_n);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMin>(big_n);
  test_scatter_view<TEST_EXECSPACE, Kokkos::Experimental::ScatterMax>(big_n);
@ -522,7 +676,9 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
  using device_type =
      Kokkos::Device<TEST_EXECSPACE, typename TEST_EXECSPACE::memory_space>;

-  test_scatter_view<device_type, Kokkos::Experimental::ScatterSum>(10);
+  test_scatter_view<device_type, Kokkos::Experimental::ScatterSum, double>(10);
+  test_scatter_view<device_type, Kokkos::Experimental::ScatterSum,
+                    unsigned int>(10);
  test_scatter_view<device_type, Kokkos::Experimental::ScatterProd>(10);
  test_scatter_view<device_type, Kokkos::Experimental::ScatterMin>(10);
  test_scatter_view<device_type, Kokkos::Experimental::ScatterMax>(10);
@ -530,14 +686,19 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
 #ifdef KOKKOS_ENABLE_CUDA
  if (std::is_same<TEST_EXECSPACE, Kokkos::Cuda>::value) {
    using cuda_device_type = Kokkos::Device<Kokkos::Cuda, Kokkos::CudaSpace>;
-    test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterSum>(10);
+    test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterSum,
+                      double>(10);
+    test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterSum,
+                      unsigned int>(10);
    test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterProd>(10);
    test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterMin>(10);
    test_scatter_view<cuda_device_type, Kokkos::Experimental::ScatterMax>(10);
    using cudauvm_device_type =
        Kokkos::Device<Kokkos::Cuda, Kokkos::CudaUVMSpace>;
-    test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterSum>(
-        10);
+    test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterSum,
+                      double>(10);
+    test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterSum,
+                      unsigned int>(10);
    test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterProd>(
        10);
    test_scatter_view<cudauvm_device_type, Kokkos::Experimental::ScatterMin>(
@ -547,7 +708,6 @@ TEST(TEST_CATEGORY, scatterview_devicetype) {
  }
 #endif
 }
-#endif

 }  // namespace Test

--- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
+++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@ -55,12 +55,10 @@ namespace TestStaticCrsGraph {

 template <class Space>
 void run_test_graph() {
-  typedef Kokkos::StaticCrsGraph<unsigned, Space> dView;
-  typedef typename dView::HostMirror hView;
+  using dView = Kokkos::StaticCrsGraph<unsigned, Space>;
+  using hView = typename dView::HostMirror;

  const unsigned LENGTH = 1000;
-  dView dx;
-  hView hx;

  std::vector<std::vector<int> > graph(LENGTH);

@ -71,6 +69,23 @@ void run_test_graph() {
    }
  }

+  {
+    dView d1;
+    ASSERT_FALSE(d1.is_allocated());
+
+    d1 = Kokkos::create_staticcrsgraph<dView>("d1", graph);
+
+    dView d2(d1);
+    dView d3(d1.entries, d1.row_map);
+
+    ASSERT_TRUE(d1.is_allocated());
+    ASSERT_TRUE(d2.is_allocated());
+    ASSERT_TRUE(d3.is_allocated());
+  }
+
+  dView dx;
+  hView hx;
+
  dx = Kokkos::create_staticcrsgraph<dView>("dx", graph);
  hx = Kokkos::create_mirror(dx);

@ -98,8 +113,8 @@ void run_test_graph() {

 template <class Space>
 void run_test_graph2() {
-  typedef Kokkos::StaticCrsGraph<unsigned[3], Space> dView;
-  typedef typename dView::HostMirror hView;
+  using dView = Kokkos::StaticCrsGraph<unsigned[3], Space>;
+  using hView = typename dView::HostMirror;

  const unsigned LENGTH = 10;

@ -158,8 +173,8 @@ template <class Space>
 void run_test_graph3(size_t B, size_t N) {
  srand(10310);

-  typedef Kokkos::StaticCrsGraph<int, Space> dView;
-  typedef typename dView::HostMirror hView;
+  using dView = Kokkos::StaticCrsGraph<int, Space>;
+  using hView = typename dView::HostMirror;

  const unsigned LENGTH = 2000;

@ -197,20 +212,13 @@ void run_test_graph3(size_t B, size_t N) {

 template <class Space>
 void run_test_graph4() {
-  typedef unsigned ordinal_type;
-  typedef Kokkos::LayoutRight layout_type;
-  typedef Space space_type;
-  typedef Kokkos::MemoryUnmanaged memory_traits_type;
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  typedef Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type,
-                                 ordinal_type, memory_traits_type>
-      dView;
-#else
-  typedef Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type,
-                                 memory_traits_type>
-      dView;
-#endif
-  typedef typename dView::HostMirror hView;
+  using ordinal_type       = unsigned;
+  using layout_type        = Kokkos::LayoutRight;
+  using space_type         = Space;
+  using memory_traits_type = Kokkos::MemoryUnmanaged;
+  using dView = Kokkos::StaticCrsGraph<ordinal_type, layout_type, space_type,
+                                       memory_traits_type>;
+  using hView = typename dView::HostMirror;

  dView dx;

@ -227,8 +235,8 @@ void run_test_graph4() {
  // of the unmanaged StaticCrsGraph

  // Data types for raw pointers storing StaticCrsGraph info
-  typedef typename dView::size_type ptr_row_map_type;
-  typedef typename dView::data_type ptr_entries_type;
+  using ptr_row_map_type = typename dView::size_type;
+  using ptr_entries_type = typename dView::data_type;

  const ordinal_type numRows = 8;
  const ordinal_type nnz     = 24;
@ -237,8 +245,8 @@ void run_test_graph4() {
                               4, 5, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7};

  // Wrap pointers in unmanaged host views
-  typedef typename hView::row_map_type local_row_map_type;
-  typedef typename hView::entries_type local_entries_type;
+  using local_row_map_type = typename hView::row_map_type;
+  using local_entries_type = typename hView::entries_type;
  local_row_map_type unman_row_map(&(ptrRaw[0]), numRows + 1);
  local_entries_type unman_entries(&(indRaw[0]), nnz);

@ -248,10 +256,10 @@ void run_test_graph4() {
  // Create the device Views for copying the host arrays into
  // An allocation is needed on the device for the unmanaged StaticCrsGraph to
  // wrap the pointer
-  typedef typename Kokkos::View<ptr_row_map_type*, layout_type, space_type>
-      d_row_map_view_type;
-  typedef typename Kokkos::View<ptr_entries_type*, layout_type, space_type>
-      d_entries_view_type;
+  using d_row_map_view_type =
+      typename Kokkos::View<ptr_row_map_type*, layout_type, space_type>;
+  using d_entries_view_type =
+      typename Kokkos::View<ptr_entries_type*, layout_type, space_type>;

  d_row_map_view_type tmp_row_map("tmp_row_map", numRows + 1);
  d_entries_view_type tmp_entries("tmp_entries", nnz);
--- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
+++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@ -53,9 +53,9 @@ namespace Impl {

 template <typename MapType, bool Near = false>
 struct TestInsert {
-  typedef MapType map_type;
-  typedef typename map_type::execution_space execution_space;
-  typedef uint32_t value_type;
+  using map_type        = MapType;
+  using execution_space = typename map_type::execution_space;
+  using value_type      = uint32_t;

  map_type map;
  uint32_t inserts;
@ -101,10 +101,10 @@ struct TestInsert {

 template <typename MapType, bool Near>
 struct TestErase {
-  typedef TestErase<MapType, Near> self_type;
+  using self_type = TestErase<MapType, Near>;

-  typedef MapType map_type;
-  typedef typename MapType::execution_space execution_space;
+  using map_type        = MapType;
+  using execution_space = typename MapType::execution_space;

  map_type m_map;
  uint32_t m_num_erase;
@ -131,9 +131,9 @@ struct TestErase {

 template <typename MapType>
 struct TestFind {
-  typedef MapType map_type;
-  typedef typename MapType::execution_space::execution_space execution_space;
-  typedef uint32_t value_type;
+  using map_type        = MapType;
+  using execution_space = typename MapType::execution_space::execution_space;
+  using value_type      = uint32_t;

  map_type m_map;
  uint32_t m_num_insert;
@ -180,9 +180,9 @@ struct TestFind {
 template <typename Device>
 void test_insert(uint32_t num_nodes, uint32_t num_inserts,
                 uint32_t num_duplicates, bool near) {
-  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, Device> map_type;
-  typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>
-      const_map_type;
+  using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;
+  using const_map_type =
+      Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>;

  const uint32_t expected_inserts =
      (num_inserts + num_duplicates - 1u) / num_duplicates;
@ -232,7 +232,7 @@ void test_insert(uint32_t num_nodes, uint32_t num_inserts,

 template <typename Device>
 void test_failed_insert(uint32_t num_nodes) {
-  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, Device> map_type;
+  using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;

  map_type map(num_nodes);
  Impl::TestInsert<map_type> test_insert(map, 2u * num_nodes, 1u);
@ -244,13 +244,11 @@ void test_failed_insert(uint32_t num_nodes) {

 template <typename Device>
 void test_deep_copy(uint32_t num_nodes) {
-  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, Device> map_type;
-  typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>
-      const_map_type;
+  using map_type = Kokkos::UnorderedMap<uint32_t, uint32_t, Device>;
+  using const_map_type =
+      Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device>;

-  typedef typename map_type::HostMirror host_map_type;
-  // typedef Kokkos::UnorderedMap<uint32_t, uint32_t, typename
-  // Device::host_mirror_execution_space > host_map_type;
+  using host_map_type = typename map_type::HostMirror;

  map_type map;
  map.rehash(num_nodes, false);
@ -295,7 +293,7 @@ void test_deep_copy(uint32_t num_nodes) {
  }
 }

-// FIXME_HIP deadlock
+// FIXME_HIP wrong result in CI but works locally
 #ifndef KOKKOS_ENABLE_HIP
 // WORKAROUND MSVC
 #ifndef _WIN32
@ -306,6 +304,7 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) {
  }
 }
 #endif
+#endif

 TEST(TEST_CATEGORY, UnorderedMap_failed_insert) {
  for (int i = 0; i < 1000; ++i) test_failed_insert<TEST_EXECSPACE>(10000);
@ -314,7 +313,6 @@ TEST(TEST_CATEGORY, UnorderedMap_failed_insert) {
 TEST(TEST_CATEGORY, UnorderedMap_deep_copy) {
  for (int i = 0; i < 2; ++i) test_deep_copy<TEST_EXECSPACE>(10000);
 }
-#endif

 TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
  using Key   = int;
@ -326,6 +324,8 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
  n = Map{m.capacity()};
  n.rehash(m.capacity());
  Kokkos::deep_copy(n, m);
+  ASSERT_TRUE(m.is_allocated());
+  ASSERT_TRUE(n.is_allocated());
 }

 }  // namespace Test
--- a/lib/kokkos/containers/unit_tests/TestVector.hpp
+++ b/lib/kokkos/containers/unit_tests/TestVector.hpp
@ -55,14 +55,17 @@ namespace Impl {

 template <typename Scalar, class Device>
 struct test_vector_insert {
-  typedef Scalar scalar_type;
-  typedef Device execution_space;
+  using scalar_type     = Scalar;
+  using execution_space = Device;

  template <typename Vector>
  void run_test(Vector& a) {
    int n = a.size();

    auto it = a.begin();
+    if (n > 0) {
+      ASSERT_EQ(a.data(), &a[0]);
+    }
    it += 15;
    ASSERT_EQ(*it, scalar_type(1));

@ -173,11 +176,42 @@ struct test_vector_insert {
 };

 template <typename Scalar, class Device>
-struct test_vector_combinations {
-  typedef test_vector_combinations<Scalar, Device> self_type;
+struct test_vector_allocate {
+  using self_type = test_vector_allocate<Scalar, Device>;

-  typedef Scalar scalar_type;
-  typedef Device execution_space;
+  using scalar_type     = Scalar;
+  using execution_space = Device;
+
+  bool result = false;
+
+  template <typename Vector>
+  bool run_me(unsigned int n) {  // true iff every is_allocated() check holds
+    {
+      Vector v1;
+      if (v1.is_allocated()) return false;
+      // Default-constructed v1 must be unallocated; sized vectors must not be.
+      v1 = Vector(n, 1);
+      Vector v2(v1);
+      Vector v3(n, 1);
+      // v2 is copy-constructed from the (now allocated) v1.
+      if (!v1.is_allocated()) return false;
+      if (!v2.is_allocated()) return false;
+      if (!v3.is_allocated()) return false;
+    }
+    return true;
+  }
+
+  test_vector_allocate(unsigned int size) {
+    result = run_me<Kokkos::vector<Scalar, Device> >(size);
+  }
+};
+
+template <typename Scalar, class Device>
+struct test_vector_combinations {
+  using self_type = test_vector_combinations<Scalar, Device>;
+
+  using scalar_type     = Scalar;
+  using execution_space = Device;

  Scalar reference;
  Scalar result;
@ -231,7 +265,14 @@ void test_vector_combinations(unsigned int size) {
  ASSERT_EQ(test.reference, test.result);
 }

+template <typename Scalar, typename Device>
+void test_vector_allocate(unsigned int size) {
+  Impl::test_vector_allocate<Scalar, Device> test(size);
+  ASSERT_TRUE(test.result);
+}
+
 TEST(TEST_CATEGORY, vector_combination) {
+  test_vector_allocate<int, TEST_EXECSPACE>(10);
  test_vector_combinations<int, TEST_EXECSPACE>(10);
  test_vector_combinations<int, TEST_EXECSPACE>(3057);
 }
--- a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
+++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp
@ -91,10 +91,10 @@ struct TestViewCtorProp_EmbeddedDim {
      {
        // Two views
        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
-        typedef
-            typename decltype(view_alloc_arg)::value_type CommonViewValueType;
-        typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
-        typedef typename CVT::HostMirror HostCVT;
+        using CommonViewValueType =
+            typename decltype(view_alloc_arg)::value_type;
+        using CVT     = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
+        using HostCVT = typename CVT::HostMirror;

        // Construct View using the common type; for case of specialization, an
        // 'embedded_dim' would be stored by view_alloc_arg
@ -128,10 +128,10 @@ struct TestViewCtorProp_EmbeddedDim {
      {
        // Single view
        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
-        typedef
-            typename decltype(view_alloc_arg)::value_type CommonViewValueType;
-        typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
-        typedef typename CVT::HostMirror HostCVT;
+        using CommonViewValueType =
+            typename decltype(view_alloc_arg)::value_type;
+        using CVT     = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
+        using HostCVT = typename CVT::HostMirror;

        // Construct View using the common type; for case of specialization, an
        // 'embedded_dim' would be stored by view_alloc_arg
@ -161,10 +161,10 @@ struct TestViewCtorProp_EmbeddedDim {
      {
        // Two views
        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
-        typedef
-            typename decltype(view_alloc_arg)::value_type CommonViewValueType;
-        typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
-        typedef typename CVT::HostMirror HostCVT;
+        using CommonViewValueType =
+            typename decltype(view_alloc_arg)::value_type;
+        using CVT     = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
+        using HostCVT = typename CVT::HostMirror;

        // Construct View using the common type; for case of specialization, an
        // 'embedded_dim' would be stored by view_alloc_arg
@ -182,10 +182,10 @@ struct TestViewCtorProp_EmbeddedDim {
      {
        // Single views
        auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
-        typedef
-            typename decltype(view_alloc_arg)::value_type CommonViewValueType;
-        typedef typename Kokkos::View<CommonViewValueType*, ExecSpace> CVT;
-        typedef typename CVT::HostMirror HostCVT;
+        using CommonViewValueType =
+            typename decltype(view_alloc_arg)::value_type;
+        using CVT     = typename Kokkos::View<CommonViewValueType*, ExecSpace>;
+        using HostCVT = typename CVT::HostMirror;

        // Construct View using the common type; for case of specialization, an
        // 'embedded_dim' would be stored by view_alloc_arg
--- a/lib/kokkos/core/CMakeLists.txt
+++ b/lib/kokkos/core/CMakeLists.txt
@ -2,7 +2,9 @@

 KOKKOS_SUBPACKAGE(Core)

-ADD_SUBDIRECTORY(src)
+IF (NOT Kokkos_INSTALL_TESTING)
+  ADD_SUBDIRECTORY(src)
+ENDIF()

 KOKKOS_ADD_TEST_DIRECTORIES(unit_test)
 KOKKOS_ADD_TEST_DIRECTORIES(perf_test)
--- a/lib/kokkos/core/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in
@ -6,7 +6,8 @@
 #if !defined(KOKKOS_FOR_SIERRA)

 #if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
-#error "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#error \
+    "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
 #else
 #define KOKKOS_CORE_CONFIG_H
 #endif
@ -25,8 +26,8 @@
 #cmakedefine KOKKOS_ENABLE_DEBUG
 #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
 #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
-#cmakedefine KOKKOS_ENABLE_PROFILING
 #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
+#cmakedefine KOKKOS_ENABLE_TUNING

 #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION

@ -38,7 +39,8 @@
 // any value of KOKKOS_USE_CUDA_UVM here.  Doing this should prevent build
 // warnings like this one:
 //
-// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: "KOKKOS_USE_CUDA_UVM" redefined
+// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning:
+// "KOKKOS_USE_CUDA_UVM" redefined
 //
 // At some point, we should edit the test-build scripts in
 // Trilinos/cmake/ctest/drivers/perseus/, and take
@ -100,4 +102,4 @@
 #cmakedefine KOKKOS_USING_DEPRECATED_VIEW
 #cmakedefine KOKKOS_ENABLE_CXX11

-#endif // !defined(KOKKOS_FOR_SIERRA)
+#endif  // !defined(KOKKOS_FOR_SIERRA)
--- a/lib/kokkos/core/perf_test/CMakeLists.txt
+++ b/lib/kokkos/core/perf_test/CMakeLists.txt
@ -49,11 +49,19 @@ SET(SOURCES
  )

 IF(Kokkos_ENABLE_HIP)
-# FIXME requires TeamPolicy
+# FIXME HIP requires TeamPolicy
  LIST(REMOVE_ITEM SOURCES
    PerfTest_CustomReduction.cpp
    PerfTest_ExecSpacePartitioning.cpp
-    )
+  )
+ENDIF()
+
+IF(Kokkos_ENABLE_OPENMPTARGET)
+# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
+  LIST(REMOVE_ITEM SOURCES
+    PerfTest_CustomReduction.cpp
+    PerfTest_ExecSpacePartitioning.cpp
+  )
 ENDIF()

 # Per #374, we always want to build this test, but we only want to run
@ -76,7 +84,22 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC")
 ENDIF()

 KOKKOS_ADD_EXECUTABLE_AND_TEST(
-  PerformanceTest_TaskDag
-  SOURCES test_taskdag.cpp
+  PerformanceTest_Atomic
+  SOURCES test_atomic.cpp
  CATEGORIES PERFORMANCE
 )
+
+KOKKOS_ADD_EXECUTABLE_AND_TEST(
+  PerformanceTest_Mempool
+  SOURCES test_mempool.cpp
+  CATEGORIES PERFORMANCE
+)
+
+IF(NOT Kokkos_ENABLE_OPENMPTARGET)
+# FIXME OPENMPTARGET needs tasking
+  KOKKOS_ADD_EXECUTABLE_AND_TEST(
+    PerformanceTest_TaskDag
+    SOURCES test_taskdag.cpp
+    CATEGORIES PERFORMANCE
+  )
+ENDIF()
--- a/lib/kokkos/core/perf_test/Makefile
+++ b/lib/kokkos/core/perf_test/Makefile
@ -53,7 +53,6 @@ TEST_TARGETS += test-atomic

 #

-ifneq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
 OBJ_MEMPOOL = test_mempool.o 
 TARGETS += KokkosCore_PerformanceTest_Mempool
 TEST_TARGETS += test-mempool
@ -63,7 +62,6 @@ TEST_TARGETS += test-mempool
 OBJ_TASKDAG = test_taskdag.o 
 TARGETS += KokkosCore_PerformanceTest_TaskDAG
 TEST_TARGETS += test-taskdag
-endif

 #

--- a/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp
+++ b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp
@ -51,12 +51,12 @@ namespace Kokkos {

 template <class Type>
 struct Dot {
-  typedef typename Type::execution_space execution_space;
+  using execution_space = typename Type::execution_space;

  static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
                "Dot static_assert Fail: Rank != 1");

-  typedef double value_type;
+  using value_type = double;

 #if 1
  typename Type::const_type X;
@ -83,12 +83,12 @@ struct Dot {

 template <class Type>
 struct DotSingle {
-  typedef typename Type::execution_space execution_space;
+  using execution_space = typename Type::execution_space;

  static_assert(static_cast<unsigned>(Type::Rank) == static_cast<unsigned>(1),
                "DotSingle static_assert Fail: Rank != 1");

-  typedef double value_type;
+  using value_type = double;

 #if 1
  typename Type::const_type X;
@ -116,7 +116,7 @@ struct DotSingle {

 template <class ScalarType, class VectorType>
 struct Scale {
-  typedef typename VectorType::execution_space execution_space;
+  using execution_space = typename VectorType::execution_space;

  static_assert(static_cast<unsigned>(ScalarType::Rank) ==
                    static_cast<unsigned>(0),
@ -143,7 +143,7 @@ struct Scale {

 template <class ScalarType, class ConstVectorType, class VectorType>
 struct AXPBY {
-  typedef typename VectorType::execution_space execution_space;
+  using execution_space = typename VectorType::execution_space;

  static_assert(static_cast<unsigned>(ScalarType::Rank) ==
                    static_cast<unsigned>(0),
@ -185,7 +185,7 @@ namespace Kokkos {
 template <class ConstScalarType, class ConstVectorType, class VectorType>
 void axpby(const ConstScalarType& alpha, const ConstVectorType& X,
           const ConstScalarType& beta, const VectorType& Y) {
-  typedef AXPBY<ConstScalarType, ConstVectorType, VectorType> functor;
+  using functor = AXPBY<ConstScalarType, ConstVectorType, VectorType>;

  parallel_for(Y.extent(0), functor(alpha, X, beta, Y));
 }
@ -193,7 +193,7 @@ void axpby(const ConstScalarType& alpha, const ConstVectorType& X,
 /** \brief  Y *= alpha */
 template <class ConstScalarType, class VectorType>
 void scale(const ConstScalarType& alpha, const VectorType& Y) {
-  typedef Scale<ConstScalarType, VectorType> functor;
+  using functor = Scale<ConstScalarType, VectorType>;

  parallel_for(Y.extent(0), functor(alpha, Y));
 }
@ -201,14 +201,14 @@ void scale(const ConstScalarType& alpha, const VectorType& Y) {
 template <class ConstVectorType, class Finalize>
 void dot(const ConstVectorType& X, const ConstVectorType& Y,
         const Finalize& finalize) {
-  typedef Dot<ConstVectorType> functor;
+  using functor = Dot<ConstVectorType>;

  parallel_reduce(X.extent(0), functor(X, Y), finalize);
 }

 template <class ConstVectorType, class Finalize>
 void dot(const ConstVectorType& X, const Finalize& finalize) {
-  typedef DotSingle<ConstVectorType> functor;
+  using functor = DotSingle<ConstVectorType>;

  parallel_reduce(X.extent(0), functor(X), finalize);
 }
--- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@ -58,7 +58,7 @@ namespace Test {
 // PostProcess : R(j,j) = result ; inv = 1 / result ;
 template <class VectorView, class ValueView>
 struct InvNorm2 : public Kokkos::DotSingle<VectorView> {
-  typedef typename Kokkos::DotSingle<VectorView>::value_type value_type;
+  using value_type = typename Kokkos::DotSingle<VectorView>::value_type;

  ValueView Rjj;
  ValueView inv;
@ -69,10 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle<VectorView> {

  KOKKOS_INLINE_FUNCTION
  void final(value_type& result) const {
-#ifndef KOKKOS_ENABLE_HIP  // FIXME_HIP
-    using std::sqrt;
-#endif
-    result = sqrt(result);
+    result = std::sqrt(result);
    Rjj()  = result;
    inv()  = (0 < result) ? 1.0 / result : 0;
  }
@ -88,7 +85,7 @@ inline void invnorm2(const VectorView& x, const ValueView& r,
 // PostProcess : tmp = - ( R(j,k) = result );
 template <class VectorView, class ValueView>
 struct DotM : public Kokkos::Dot<VectorView> {
-  typedef typename Kokkos::Dot<VectorView>::value_type value_type;
+  using value_type = typename Kokkos::Dot<VectorView>::value_type;

  ValueView Rjk;
  ValueView tmp;
@ -113,16 +110,16 @@ inline void dot_neg(const VectorView& x, const VectorView& y,

 template <typename Scalar, class DeviceType>
 struct ModifiedGramSchmidt {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;

-  typedef Kokkos::View<Scalar**, Kokkos::LayoutLeft, execution_space>
-      multivector_type;
+  using multivector_type =
+      Kokkos::View<Scalar**, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::View<Scalar*, Kokkos::LayoutLeft, execution_space>
-      vector_type;
+  using vector_type =
+      Kokkos::View<Scalar*, Kokkos::LayoutLeft, execution_space>;

-  typedef Kokkos::View<Scalar, Kokkos::LayoutLeft, execution_space> value_view;
+  using value_view = Kokkos::View<Scalar, Kokkos::LayoutLeft, execution_space>;

  multivector_type Q;
  multivector_type R;
@ -243,9 +240,9 @@ TEST(default_exec, gramschmidt) {
  int exp_end    = 20;
  int num_trials = 5;

-  if (command_line_num_args() > 1) exp_beg = atoi(command_line_arg(1));
-  if (command_line_num_args() > 2) exp_end = atoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = atoi(command_line_arg(3));
+  if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
+  if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
+  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));

  EXPECT_NO_THROW(run_test_gramschmidt<Kokkos::DefaultExecutionSpace>(
      exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
--- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
@ -51,20 +51,20 @@ namespace Test {
 template <class DeviceType, typename CoordScalarType = double,
          typename GradScalarType = float>
 struct HexGrad {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;

-  typedef HexGrad<DeviceType, CoordScalarType, GradScalarType> self_type;
+  using self_type = HexGrad<DeviceType, CoordScalarType, GradScalarType>;

  // 3D array : ( ParallelWork , Space , Node )

  enum { NSpace = 3, NNode = 8 };

-  typedef Kokkos::View<CoordScalarType * [NSpace][NNode], execution_space>
-      elem_coord_type;
+  using elem_coord_type =
+      Kokkos::View<CoordScalarType * [NSpace][NNode], execution_space>;

-  typedef Kokkos::View<GradScalarType * [NSpace][NNode], execution_space>
-      elem_grad_type;
+  using elem_grad_type =
+      Kokkos::View<GradScalarType * [NSpace][NNode], execution_space>;

  elem_coord_type coords;
  elem_grad_type grad_op;
@ -179,7 +179,7 @@ struct HexGrad {
  //--------------------------------------------------------------------------

  struct Init {
-    typedef typename self_type::execution_space execution_space;
+    using execution_space = typename self_type::execution_space;

    elem_coord_type coords;

@ -289,9 +289,9 @@ TEST(default_exec, hexgrad) {
  int exp_end    = 20;
  int num_trials = 5;

-  if (command_line_num_args() > 1) exp_beg = atoi(command_line_arg(1));
-  if (command_line_num_args() > 2) exp_end = atoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = atoi(command_line_arg(3));
+  if (command_line_num_args() > 1) exp_beg = std::stoi(command_line_arg(1));
+  if (command_line_num_args() > 2) exp_end = std::stoi(command_line_arg(2));
+  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));

  EXPECT_NO_THROW(run_test_hexgrad<Kokkos::DefaultExecutionSpace>(
      exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name()));
--- a/lib/kokkos/core/perf_test/PerfTestMDRange.hpp
+++ b/lib/kokkos/core/perf_test/PerfTestMDRange.hpp
@ -46,13 +46,13 @@ namespace Test {
 template <class DeviceType, typename ScalarType = double,
          typename TestLayout = Kokkos::LayoutRight>
 struct MultiDimRangePerf3D {
-  typedef DeviceType execution_space;
-  typedef typename execution_space::size_type size_type;
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;

  using iterate_type = Kokkos::Iterate;

-  typedef Kokkos::View<ScalarType ***, TestLayout, DeviceType> view_type;
-  typedef typename view_type::HostMirror host_view_type;
+  using view_type      = Kokkos::View<ScalarType ***, TestLayout, DeviceType>;
+  using host_view_type = typename view_type::HostMirror;

  view_type A;
  view_type B;
@ -108,8 +108,8 @@ struct MultiDimRangePerf3D {
    // This test performs multidim range over all dims
    view_type Atest("Atest", icount, jcount, kcount);
    view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2);
-    typedef MultiDimRangePerf3D<execution_space, ScalarType, TestLayout>
-        FunctorType;
+    using FunctorType =
+        MultiDimRangePerf3D<execution_space, ScalarType, TestLayout>;

    double dt_min = 0;

@ -125,10 +125,9 @@ struct MultiDimRangePerf3D {
          policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}},
                       {{Ti, Tj, Tk}});

-      typedef typename Kokkos::MDRangePolicy<
+      using MDRangeType = typename Kokkos::MDRangePolicy<
          Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>,
-          execution_space>
-          MDRangeType;
+          execution_space>;
      using tile_type  = typename MDRangeType::tile_type;
      using point_type = typename MDRangeType::point_type;

@ -216,14 +215,15 @@ struct MultiDimRangePerf3D {
          policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}},
                       {{Ti, Tj, Tk}});

-      // typedef typename Kokkos::MDRangePolicy<Kokkos::Rank<3,
-      // iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
+      // using MDRangeType =
+      //     typename Kokkos::MDRangePolicy<
+      //         Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>,
+      //         execution_space >;
      // using tile_type = typename MDRangeType::tile_type;
      // using point_type = typename MDRangeType::point_type;
-      // Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Left,
-      // iterate_type::Left>, execution_space >
-      // policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}}
-      // );
+      // MDRangeType policy(point_type{{0,0,0}},
+      //                    point_type{{icount,jcount,kcount}},
+      //                    tile_type{{Ti,Tj,Tk}});
      Kokkos::MDRangePolicy<
          Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>,
          execution_space>
@ -306,14 +306,14 @@ struct RangePolicyCollapseTwo {
  // RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for
  // multi-dim; unroll 2 dims in one-dim

-  typedef DeviceType execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef TestLayout layout;
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;
+  using layout          = TestLayout;

  using iterate_type = Kokkos::Iterate;

-  typedef Kokkos::View<ScalarType ***, TestLayout, DeviceType> view_type;
-  typedef typename view_type::HostMirror host_view_type;
+  using view_type      = Kokkos::View<ScalarType ***, TestLayout, DeviceType>;
+  using host_view_type = typename view_type::HostMirror;

  view_type A;
  view_type B;
@ -388,8 +388,8 @@ struct RangePolicyCollapseTwo {
    // This test refers to collapsing two dims while using the RangePolicy
    view_type Atest("Atest", icount, jcount, kcount);
    view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2);
-    typedef RangePolicyCollapseTwo<execution_space, ScalarType, TestLayout>
-        FunctorType;
+    using FunctorType =
+        RangePolicyCollapseTwo<execution_space, ScalarType, TestLayout>;

    long collapse_index_rangeA = 0;
    long collapse_index_rangeB = 0;
@ -480,12 +480,12 @@ template <class DeviceType, typename ScalarType = double,
 struct RangePolicyCollapseAll {
  // RangePolicy for 3D range, but will collapse all dims

-  typedef DeviceType execution_space;
-  typedef typename execution_space::size_type size_type;
-  typedef TestLayout layout;
+  using execution_space = DeviceType;
+  using size_type       = typename execution_space::size_type;
+  using layout          = TestLayout;

-  typedef Kokkos::View<ScalarType ***, TestLayout, DeviceType> view_type;
-  typedef typename view_type::HostMirror host_view_type;
+  using view_type      = Kokkos::View<ScalarType ***, TestLayout, DeviceType>;
+  using host_view_type = typename view_type::HostMirror;

  view_type A;
  view_type B;
@ -552,8 +552,8 @@ struct RangePolicyCollapseAll {
    // This test refers to collapsing all dims using the RangePolicy
    view_type Atest("Atest", icount, jcount, kcount);
    view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2);
-    typedef RangePolicyCollapseAll<execution_space, ScalarType, TestLayout>
-        FunctorType;
+    using FunctorType =
+        RangePolicyCollapseAll<execution_space, ScalarType, TestLayout>;

    const long flat_index_range = icount * jcount * kcount;
    Kokkos::RangePolicy<execution_space> policy(0, flat_index_range);
--- a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
+++ b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
@ -129,9 +129,9 @@ TEST(default_exec, custom_reduction) {
  int R          = 1000;
  int num_trials = 1;

-  if (command_line_num_args() > 1) N = atoi(command_line_arg(1));
-  if (command_line_num_args() > 2) R = atoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = atoi(command_line_arg(3));
+  if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1));
+  if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2));
+  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
  custom_reduction_test<double>(N, R, num_trials);
 }
 }  // namespace Test
--- a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
+++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
@ -29,7 +29,7 @@ struct SpaceInstance<Kokkos::Cuda> {
    bool value          = true;
    auto local_rank_str = std::getenv("CUDA_LAUNCH_BLOCKING");
    if (local_rank_str) {
-      value = (std::atoi(local_rank_str) == 0);
+      value = (std::stoi(local_rank_str) == 0);
    }
    return value;
  }
--- a/lib/kokkos/core/perf_test/test_atomic.cpp
+++ b/lib/kokkos/core/perf_test/test_atomic.cpp
@ -49,7 +49,7 @@
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Timer.hpp>

-typedef Kokkos::DefaultExecutionSpace exec_space;
+using exec_space = Kokkos::DefaultExecutionSpace;

 #define RESET 0
 #define BRIGHT 1
@ -80,9 +80,9 @@ void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }

 template <class T, class DEVICE_TYPE>
 struct ZeroFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T, execution_space> type;
-  typedef typename Kokkos::View<T, execution_space>::HostMirror h_type;
+  using execution_space = DEVICE_TYPE;
+  using type            = typename Kokkos::View<T, execution_space>;
+  using h_type          = typename Kokkos::View<T, execution_space>::HostMirror;
  type data;
  KOKKOS_INLINE_FUNCTION
  void operator()(int) const { data() = 0; }
@ -94,8 +94,8 @@ struct ZeroFunctor {

 template <class T, class DEVICE_TYPE>
 struct AddFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T, execution_space> type;
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
  type data;

  KOKKOS_INLINE_FUNCTION
@ -123,8 +123,8 @@ T AddLoop(int loop) {

 template <class T, class DEVICE_TYPE>
 struct AddNonAtomicFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T, execution_space> type;
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
  type data;

  KOKKOS_INLINE_FUNCTION
@ -166,8 +166,8 @@ T AddLoopSerial(int loop) {

 template <class T, class DEVICE_TYPE>
 struct CASFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T, execution_space> type;
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
  type data;

  KOKKOS_INLINE_FUNCTION
@ -204,8 +204,8 @@ T CASLoop(int loop) {

 template <class T, class DEVICE_TYPE>
 struct CASNonAtomicFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T, execution_space> type;
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
  type data;

  KOKKOS_INLINE_FUNCTION
@ -268,8 +268,8 @@ T CASLoopSerial(int loop) {

 template <class T, class DEVICE_TYPE>
 struct ExchFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T, execution_space> type;
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
  type data, data2;

  KOKKOS_INLINE_FUNCTION
@ -309,8 +309,8 @@ T ExchLoop(int loop) {

 template <class T, class DEVICE_TYPE>
 struct ExchNonAtomicFunctor {
-  typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T, execution_space> type;
+  using execution_space = DEVICE_TYPE;
+  using type            = Kokkos::View<T, execution_space>;
  type data, data2;

  KOKKOS_INLINE_FUNCTION
@ -448,15 +448,15 @@ int main(int argc, char* argv[]) {

  for (int i = 0; i < argc; i++) {
    if ((strcmp(argv[i], "--test") == 0)) {
-      test = atoi(argv[++i]);
+      test = std::stoi(argv[++i]);
      continue;
    }
    if ((strcmp(argv[i], "--type") == 0)) {
-      type = atoi(argv[++i]);
+      type = std::stoi(argv[++i]);
      continue;
    }
    if ((strcmp(argv[i], "-l") == 0) || (strcmp(argv[i], "--loop") == 0)) {
-      loop = atoi(argv[++i]);
+      loop = std::stoi(argv[++i]);
      continue;
    }
  }
--- a/lib/kokkos/core/perf_test/test_mempool.cpp
+++ b/lib/kokkos/core/perf_test/test_mempool.cpp
@ -56,7 +56,7 @@ using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
 using MemoryPool = Kokkos::MemoryPool<ExecSpace>;

 struct TestFunctor {
-  typedef Kokkos::View<uintptr_t*, ExecSpace> ptrs_type;
+  using ptrs_type = Kokkos::View<uintptr_t*, ExecSpace>;

  enum : unsigned { chunk = 32 };

@ -87,7 +87,7 @@ struct TestFunctor {

  //----------------------------------------

-  typedef long value_type;
+  using value_type = long;

  //----------------------------------------

@ -107,7 +107,7 @@ struct TestFunctor {
  }

  bool test_fill() {
-    typedef Kokkos::RangePolicy<ExecSpace, TagFill> policy;
+    using policy = Kokkos::RangePolicy<ExecSpace, TagFill>;

    long result = 0;

@ -134,7 +134,7 @@ struct TestFunctor {
  }

  void test_del() {
-    typedef Kokkos::RangePolicy<ExecSpace, TagDel> policy;
+    using policy = Kokkos::RangePolicy<ExecSpace, TagDel>;

    Kokkos::parallel_for(policy(0, range_iter), *this);
    Kokkos::fence();
@ -164,7 +164,7 @@ struct TestFunctor {
  }

  bool test_alloc_dealloc() {
-    typedef Kokkos::RangePolicy<ExecSpace, TagAllocDealloc> policy;
+    using policy = Kokkos::RangePolicy<ExecSpace, TagAllocDealloc>;

    long error_count = 0;

@ -203,22 +203,22 @@ int main(int argc, char* argv[]) {
      total_alloc_size = atol(a + strlen(alloc_size_flag));

    if (!strncmp(a, super_size_flag, strlen(super_size_flag)))
-      min_superblock_size = atoi(a + strlen(super_size_flag));
+      min_superblock_size = std::stoi(a + strlen(super_size_flag));

    if (!strncmp(a, fill_stride_flag, strlen(fill_stride_flag)))
-      fill_stride = atoi(a + strlen(fill_stride_flag));
+      fill_stride = std::stoi(a + strlen(fill_stride_flag));

    if (!strncmp(a, fill_level_flag, strlen(fill_level_flag)))
-      fill_level = atoi(a + strlen(fill_level_flag));
+      fill_level = std::stoi(a + strlen(fill_level_flag));

    if (!strncmp(a, chunk_span_flag, strlen(chunk_span_flag)))
-      chunk_span = atoi(a + strlen(chunk_span_flag));
+      chunk_span = std::stoi(a + strlen(chunk_span_flag));

    if (!strncmp(a, repeat_outer_flag, strlen(repeat_outer_flag)))
-      repeat_outer = atoi(a + strlen(repeat_outer_flag));
+      repeat_outer = std::stoi(a + strlen(repeat_outer_flag));

    if (!strncmp(a, repeat_inner_flag, strlen(repeat_inner_flag)))
-      repeat_inner = atoi(a + strlen(repeat_inner_flag));
+      repeat_inner = std::stoi(a + strlen(repeat_inner_flag));
  }

  int chunk_span_bytes = 0;
--- a/lib/kokkos/core/perf_test/test_taskdag.cpp
+++ b/lib/kokkos/core/perf_test/test_taskdag.cpp
@ -91,7 +91,7 @@ struct TestFib {
  using MemberType  = typename Scheduler::member_type;
  using FutureType  = Kokkos::BasicFuture<long, Scheduler>;

-  typedef long value_type;
+  using value_type = long;

  FutureType dep[2];
  const value_type n;
@ -152,13 +152,13 @@ int main(int argc, char* argv[]) {
      total_alloc_size = atol(a + strlen(alloc_size));

    if (!strncmp(a, super_size, strlen(super_size)))
-      min_superblock_size = atoi(a + strlen(super_size));
+      min_superblock_size = std::stoi(a + strlen(super_size));

    if (!strncmp(a, repeat_outer, strlen(repeat_outer)))
-      test_repeat_outer = atoi(a + strlen(repeat_outer));
+      test_repeat_outer = std::stoi(a + strlen(repeat_outer));

    if (!strncmp(a, input_value, strlen(input_value)))
-      fib_input = atoi(a + strlen(input_value));
+      fib_input = std::stoi(a + strlen(input_value));
  }

  const long fib_output   = eval_fib(fib_input);
@ -182,7 +182,7 @@ int main(int argc, char* argv[]) {

  using Scheduler = Kokkos::TaskSchedulerMultiple<ExecSpace>;

-  typedef TestFib<Scheduler> Functor;
+  using Functor = TestFib<Scheduler>;

  Kokkos::initialize(argc, argv);

--- a/lib/kokkos/core/src/CMakeLists.txt
+++ b/lib/kokkos/core/src/CMakeLists.txt
@ -8,50 +8,49 @@ KOKKOS_INCLUDE_DIRECTORIES(
 INSTALL (DIRECTORY
  "${CMAKE_CURRENT_SOURCE_DIR}/"
  DESTINATION ${KOKKOS_HEADER_DIR}
-  FILES_MATCHING PATTERN "*.hpp"
+  FILES_MATCHING
+  PATTERN "*.hpp"
+  PATTERN "*.h"
 )

 SET(KOKKOS_CORE_SRCS)
 APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp)
+SET(KOKKOS_CORE_HEADERS)
+APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
+APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp)

 IF (KOKKOS_ENABLE_ROCM)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/ROCm/*.cpp)
-  IF (KOKKOS_ENABLE_ETI)
-    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/ROCm/*.cpp)
-  ENDIF()
 ENDIF()

 IF (KOKKOS_ENABLE_CUDA)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp)
-  IF (KOKKOS_ENABLE_ETI)
-    APPEND_GLOB(KOKKOS_CORE_SRC ${CMAKE_CURRENT_SOURCE_DIR/eti/Cuda/*.cpp)
-  ENDIF()
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp)
 ENDIF()

 IF (KOKKOS_ENABLE_OPENMP)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp)
-  IF (KOKKOS_ENABLE_ETI)
-    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/OpenMP/*.cpp)
-  ENDIF()
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp)
 ENDIF()

 IF (KOKKOS_ENABLE_OPENMPTARGET)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp)
 ENDIF()

 IF (KOKKOS_ENABLE_PTHREAD)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp)
-  IF (KOKKOS_ENABLE_ETI)
-    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/Threads/*.cpp)
-  ENDIF()
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp)
 ENDIF()

 IF (KOKKOS_ENABLE_HIP)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp)
 ENDIF()

 IF (KOKKOS_ENABLE_HPX)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp)
 ENDIF()

 IF (NOT KOKKOS_ENABLE_MEMKIND)
@ -59,9 +58,7 @@ IF (NOT KOKKOS_ENABLE_MEMKIND)
 ENDIF()

 IF (KOKKOS_ENABLE_SERIAL)
-  IF (KOKKOS_ENABLE_ETI)
-    APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/eti/Serial/*.cpp)
-  ENDIF()
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp)
 ELSE()
  LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial.cpp)
  LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_Serial_task.cpp)
@ -70,6 +67,8 @@ ENDIF()
 KOKKOS_ADD_LIBRARY(
  kokkoscore
  SOURCES ${KOKKOS_CORE_SRCS}
+  HEADERS ${KOKKOS_CORE_HEADERS}
+  ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags
 )

 SET_TARGET_PROPERTIES(kokkoscore PROPERTIES VERSION ${Kokkos_VERSION})
--- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
@ -48,7 +48,6 @@
 #include <Kokkos_Macros.hpp>
 #if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)

-#include <iostream>
 #include <algorithm>
 #include <cstdio>

@ -60,10 +59,8 @@
 // type is not allowed As a result, recreate cuda_parallel_launch and associated
 // code

-#if defined(KOKKOS_ENABLE_PROFILING)
-#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <impl/Kokkos_Tools.hpp>
 #include <typeinfo>
-#endif

 namespace Kokkos {
 namespace Impl {
@ -1291,8 +1288,8 @@ struct DeviceIterateTile {
  using point_type       = typename RP::point_type;

  struct VoidDummy {};
-  typedef typename std::conditional<std::is_same<Tag, void>::value, VoidDummy,
-                                    Tag>::type usable_tag;
+  using usable_tag = typename std::conditional<std::is_same<Tag, void>::value,
+                                               VoidDummy, Tag>::type;

  DeviceIterateTile(const RP& rp, const Functor& func)
      : m_rp{rp}, m_func{func} {}
@ -1310,6 +1307,8 @@ struct DeviceIterateTile {
        65535;  // not true for blockIdx.x for newer archs
    if (RP::rank == 2) {
      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
      const dim3 grid(
          std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
                   maxblocks),
@ -1319,6 +1318,9 @@ struct DeviceIterateTile {
      CudaLaunch<DeviceIterateTile>(*this, grid, block);
    } else if (RP::rank == 3) {
      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
          std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
                   maxblocks),
@ -1332,6 +1334,8 @@ struct DeviceIterateTile {
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
                       m_rp.m_tile[3]);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
          std::min(
              static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
@ -1346,6 +1350,7 @@ struct DeviceIterateTile {
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
                       m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
+      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
          std::min(
              static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
--- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@ -48,9 +48,7 @@
 #include <Kokkos_Macros.hpp>
 #if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)

-#include <iostream>
 #include <algorithm>
-#include <cstdio>

 #include <utility>

@ -60,10 +58,8 @@
 // type is not allowed use existing Kokkos functionality, e.g. max blocks, once
 // resolved

-#if defined(KOKKOS_ENABLE_PROFILING)
-#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <impl/Kokkos_Tools.hpp>
 #include <typeinfo>
-#endif

 namespace Kokkos {
 namespace Impl {
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@ -60,9 +60,7 @@
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_MemorySpace.hpp>

-#if defined(KOKKOS_ENABLE_PROFILING)
-#include <impl/Kokkos_Profiling_Interface.hpp>
-#endif
+#include <impl/Kokkos_Tools.hpp>

 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
@ -75,8 +73,8 @@ namespace {
 static std::atomic<int> num_uvm_allocations(0);

 cudaStream_t get_deep_copy_stream() {
-  static cudaStream_t s = 0;
-  if (s == 0) {
+  static cudaStream_t s = nullptr;
+  if (s == nullptr) {
    cudaStreamCreate(&s);
  }
  return s;
@ -201,6 +199,10 @@ CudaHostPinnedSpace::CudaHostPinnedSpace() {}
 // <editor-fold desc="allocate()"> {{{1

 void *CudaSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
+                          const size_t arg_logical_size) const {
  void *ptr = nullptr;

  auto error_code = cudaMalloc(&ptr, arg_alloc_size);
@ -213,10 +215,22 @@ void *CudaSpace::allocate(const size_t arg_alloc_size) const {
        Experimental::RawMemoryAllocationFailure::AllocationMechanism::
            CudaMalloc);
  }
+
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(
+        Kokkos::Profiling::make_space_handle(name()), arg_label, ptr,
+        reported_size);
+  }
  return ptr;
 }

 void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void *CudaUVMSpace::allocate(const char *arg_label, const size_t arg_alloc_size,
+                             const size_t arg_logical_size) const {
  void *ptr = nullptr;

  Cuda::impl_static_fence();
@ -243,11 +257,22 @@ void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const {
    }
  }
  Cuda::impl_static_fence();
-
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(
+        Kokkos::Profiling::make_space_handle(name()), arg_label, ptr,
+        reported_size);
+  }
  return ptr;
 }

 void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
+  return allocate("[unlabeled]", arg_alloc_size);
+}
+void *CudaHostPinnedSpace::allocate(const char *arg_label,
+                                    const size_t arg_alloc_size,
+                                    const size_t arg_logical_size) const {
  void *ptr = nullptr;

  auto error_code = cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault);
@ -260,24 +285,56 @@ void *CudaHostPinnedSpace::allocate(const size_t arg_alloc_size) const {
        Experimental::RawMemoryAllocationFailure::AllocationMechanism::
            CudaHostAlloc);
  }
-
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::allocateData(
+        Kokkos::Profiling::make_space_handle(name()), arg_label, ptr,
+        reported_size);
+  }
  return ptr;
 }

 // </editor-fold> end allocate() }}}1
 //==============================================================================
-
 void CudaSpace::deallocate(void *const arg_alloc_ptr,
-                           const size_t /* arg_alloc_size */) const {
+                           const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+void CudaSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
+                           const size_t arg_alloc_size,
+                           const size_t arg_logical_size) const {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(
+        Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
+        reported_size);
+  }
+
  try {
    CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
  } catch (...) {
  }
 }
-
 void CudaUVMSpace::deallocate(void *const arg_alloc_ptr,
-                              const size_t /* arg_alloc_size */) const {
+                              const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void CudaUVMSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr,
+                              const size_t arg_alloc_size
+
+                              ,
+                              const size_t arg_logical_size) const {
  Cuda::impl_static_fence();
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(
+        Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
+        reported_size);
+  }
  try {
    if (arg_alloc_ptr != nullptr) {
      Kokkos::Impl::num_uvm_allocations--;
@ -289,7 +346,21 @@ void CudaUVMSpace::deallocate(void *const arg_alloc_ptr,
 }

 void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
-                                     const size_t /* arg_alloc_size */) const {
+                                     const size_t arg_alloc_size) const {
+  deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size);
+}
+
+void CudaHostPinnedSpace::deallocate(const char *arg_label,
+                                     void *const arg_alloc_ptr,
+                                     const size_t arg_alloc_size,
+                                     const size_t arg_logical_size) const {
+  if (Kokkos::Profiling::profileLibraryLoaded()) {
+    const size_t reported_size =
+        (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
+    Kokkos::Profiling::deallocateData(
+        Kokkos::Profiling::make_space_handle(name()), arg_label, arg_alloc_ptr,
+        reported_size);
+  }
  try {
    CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
  } catch (...) {
@ -321,7 +392,8 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
    size_t const alloc_size) {
  enum { TEXTURE_BOUND_1D = 1u << 27 };

-  if ((alloc_ptr == 0) || (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
+  if ((alloc_ptr == nullptr) ||
+      (sizeof_alias * TEXTURE_BOUND_1D <= alloc_size)) {
    std::ostringstream msg;
    msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
        << " alloc_ptr(" << alloc_ptr << ")"
@ -434,48 +506,36 @@ void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::deallocate(
 // <editor-fold desc="SharedAllocationRecord destructors"> {{{1

 SharedAllocationRecord<Kokkos::CudaSpace, void>::~SharedAllocationRecord() {
-#if defined(KOKKOS_ENABLE_PROFILING)
+  const char *label = nullptr;
  if (Kokkos::Profiling::profileLibraryLoaded()) {
    SharedAllocationHeader header;
-    Kokkos::Impl::DeepCopy<CudaSpace, HostSpace>(
+    Kokkos::Impl::DeepCopy<Kokkos::CudaSpace, HostSpace>(
        &header, RecordBase::m_alloc_ptr, sizeof(SharedAllocationHeader));
-
-    Kokkos::Profiling::deallocateData(
-        Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),
-        header.m_label, data(), size());
+    label = header.label();
  }
-#endif
-
-  m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     SharedAllocationRecord<void, void>::m_alloc_size);
+  auto alloc_size = SharedAllocationRecord<void, void>::m_alloc_size;
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     alloc_size, (alloc_size - sizeof(SharedAllocationHeader)));
 }

 SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::~SharedAllocationRecord() {
-#if defined(KOKKOS_ENABLE_PROFILING)
+  const char *label = nullptr;
  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    Cuda::impl_static_fence();  // Make sure I can access the label ...
-    Kokkos::Profiling::deallocateData(
-        Kokkos::Profiling::SpaceHandle(Kokkos::CudaUVMSpace::name()),
-        RecordBase::m_alloc_ptr->m_label, data(), size());
+    label = RecordBase::m_alloc_ptr->m_label;
  }
-#endif
-
-  m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     SharedAllocationRecord<void, void>::m_alloc_size);
+  m_space.deallocate(label, SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     SharedAllocationRecord<void, void>::m_alloc_size,
+                     (SharedAllocationRecord<void, void>::m_alloc_size -
+                      sizeof(SharedAllocationHeader)));
 }

 SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                       void>::~SharedAllocationRecord() {
-#if defined(KOKKOS_ENABLE_PROFILING)
-  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    Kokkos::Profiling::deallocateData(
-        Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),
-        RecordBase::m_alloc_ptr->m_label, data(), size());
-  }
-#endif
-
-  m_space.deallocate(SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     SharedAllocationRecord<void, void>::m_alloc_size);
+  m_space.deallocate(RecordBase::m_alloc_ptr->m_label,
+                     SharedAllocationRecord<void, void>::m_alloc_ptr,
+                     SharedAllocationRecord<void, void>::m_alloc_size,
+                     (SharedAllocationRecord<void, void>::m_alloc_size -
+                      sizeof(SharedAllocationHeader)));
 }

 // </editor-fold> end SharedAllocationRecord destructors }}}1
@ -499,13 +559,6 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::SharedAllocationRecord(
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_tex_obj(0),
      m_space(arg_space) {
-#if defined(KOKKOS_ENABLE_PROFILING)
-  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    Kokkos::Profiling::allocateData(
-        Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(),
-        arg_alloc_size);
-  }
-#endif

  SharedAllocationHeader header;

@ -537,13 +590,6 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::SharedAllocationRecord(
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_tex_obj(0),
      m_space(arg_space) {
-#if defined(KOKKOS_ENABLE_PROFILING)
-  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    Kokkos::Profiling::allocateData(
-        Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(),
-        arg_alloc_size);
-  }
-#endif
  // Fill in the Header information, directly accessible via UVM

  RecordBase::m_alloc_ptr->m_record = this;
@ -572,13 +618,6 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
                                               arg_alloc_size),
          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc),
      m_space(arg_space) {
-#if defined(KOKKOS_ENABLE_PROFILING)
-  if (Kokkos::Profiling::profileLibraryLoaded()) {
-    Kokkos::Profiling::allocateData(
-        Kokkos::Profiling::SpaceHandle(arg_space.name()), arg_label, data(),
-        arg_alloc_size);
-  }
-#endif
  // Fill in the Header information, directly accessible on the host

  RecordBase::m_alloc_ptr->m_record = this;
@ -599,7 +638,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::
 void *SharedAllocationRecord<Kokkos::CudaSpace, void>::allocate_tracked(
    const Kokkos::CudaSpace &arg_space, const std::string &arg_alloc_label,
    const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return (void *)0;
+  if (!arg_alloc_size) return nullptr;

  SharedAllocationRecord *const r =
      allocate(arg_space, arg_alloc_label, arg_alloc_size);
@ -611,7 +650,7 @@ void *SharedAllocationRecord<Kokkos::CudaSpace, void>::allocate_tracked(

 void SharedAllocationRecord<Kokkos::CudaSpace, void>::deallocate_tracked(
    void *const arg_alloc_ptr) {
-  if (arg_alloc_ptr != 0) {
+  if (arg_alloc_ptr != nullptr) {
    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);

    RecordBase::decrement(r);
@ -636,7 +675,7 @@ void *SharedAllocationRecord<Kokkos::CudaSpace, void>::reallocate_tracked(
 void *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::allocate_tracked(
    const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_alloc_label,
    const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return (void *)0;
+  if (!arg_alloc_size) return nullptr;

  SharedAllocationRecord *const r =
      allocate(arg_space, arg_alloc_label, arg_alloc_size);
@ -648,7 +687,7 @@ void *SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::allocate_tracked(

 void SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::deallocate_tracked(
    void *const arg_alloc_ptr) {
-  if (arg_alloc_ptr != 0) {
+  if (arg_alloc_ptr != nullptr) {
    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);

    RecordBase::decrement(r);
@ -674,7 +713,7 @@ void *
 SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::allocate_tracked(
    const Kokkos::CudaHostPinnedSpace &arg_space,
    const std::string &arg_alloc_label, const size_t arg_alloc_size) {
-  if (!arg_alloc_size) return (void *)0;
+  if (!arg_alloc_size) return nullptr;

  SharedAllocationRecord *const r =
      allocate(arg_space, arg_alloc_label, arg_alloc_size);
@ -687,7 +726,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>::allocate_tracked(
 void SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
                            void>::deallocate_tracked(void *const
                                                          arg_alloc_ptr) {
-  if (arg_alloc_ptr != 0) {
+  if (arg_alloc_ptr != nullptr) {
    SharedAllocationRecord *const r = get_record(arg_alloc_ptr);

    RecordBase::decrement(r);
@ -726,7 +765,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::get_record(void *alloc_ptr) {
  Header head;

  Header const *const head_cuda =
-      alloc_ptr ? Header::get_header(alloc_ptr) : (Header *)0;
+      alloc_ptr ? Header::get_header(alloc_ptr) : nullptr;

  if (alloc_ptr) {
    Kokkos::Impl::DeepCopy<HostSpace, CudaSpace>(
@ -734,7 +773,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::get_record(void *alloc_ptr) {
  }

  RecordCuda *const record =
-      alloc_ptr ? static_cast<RecordCuda *>(head.m_record) : (RecordCuda *)0;
+      alloc_ptr ? static_cast<RecordCuda *>(head.m_record) : nullptr;

  if (!alloc_ptr || record->m_alloc_ptr != head_cuda) {
    Kokkos::Impl::throw_runtime_exception(
@ -751,7 +790,7 @@ SharedAllocationRecord<Kokkos::CudaUVMSpace, void> *SharedAllocationRecord<
  using RecordCuda = SharedAllocationRecord<Kokkos::CudaUVMSpace, void>;

  Header *const h =
-      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : (Header *)0;
+      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : nullptr;

  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
    Kokkos::Impl::throw_runtime_exception(
@ -769,7 +808,7 @@ SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
  using RecordCuda = SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;

  Header *const h =
-      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : (Header *)0;
+      alloc_ptr ? reinterpret_cast<Header *>(alloc_ptr) - 1 : nullptr;

  if (!alloc_ptr || h->m_record->m_alloc_ptr != h) {
    Kokkos::Impl::throw_runtime_exception(
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp
@ -48,22 +48,102 @@
 #include <Kokkos_Macros.hpp>
 #ifdef KOKKOS_ENABLE_CUDA

-#include <iostream>
 #include <Cuda/Kokkos_Cuda_Error.hpp>

 namespace Kokkos {
 namespace Impl {

-template <class DriverType, class LaunchBounds, bool Large>
-struct CudaGetMaxBlockSize;
+inline int cuda_max_active_blocks_per_sm(cudaDeviceProp const& properties,
+                                         cudaFuncAttributes const& attributes,
+                                         int block_size, size_t dynamic_shmem) {
+  // Max blocks/SM for this kernel at block_size. Limit due to registers/SM:
+  int const regs_per_sm     = properties.regsPerMultiprocessor;
+  int const regs_per_thread = attributes.numRegs;
+  int const max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);

-template <class DriverType, class LaunchBounds>
-int cuda_get_max_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-  return CudaGetMaxBlockSize<DriverType, LaunchBounds, true>::get_block_size(
-      f, vector_length, shmem_extra_block, shmem_extra_thread);
+  // Limits due to shared memory/SM (static kernel usage + requested dynamic)
+  size_t const shmem_per_sm            = properties.sharedMemPerMultiprocessor;
+  size_t const shmem_per_block         = properties.sharedMemPerBlock;
+  size_t const static_shmem            = attributes.sharedSizeBytes;
+  size_t const dynamic_shmem_per_block = attributes.maxDynamicSharedSizeBytes;
+  size_t const total_shmem             = static_shmem + dynamic_shmem;
+
+  int const max_blocks_shmem =
+      total_shmem > shmem_per_block || dynamic_shmem > dynamic_shmem_per_block
+          ? 0  // request exceeds a per-block limit: no block fits
+          : (total_shmem > 0 ? (int)shmem_per_sm / total_shmem
+                             : max_blocks_regs);  // no shmem used: reuse the register limit
+
+  // Limits due to blocks/SM
+#if CUDA_VERSION >= 11000
+  int const max_blocks_per_sm = properties.maxBlocksPerMultiProcessor;
+#else  // CUDA_VERSION < 11000: derive the limit from properties.major/minor
+  int const max_blocks_per_sm = [&properties]() {
+    switch (properties.major) {
+      case 3: return 16;
+      case 5:
+      case 6: return 32;
+      case 7: {
+        int isTuring = properties.minor == 5;
+        return (isTuring) ? 16 : 32;
+      }
+      default:  // properties.major is not one of the values handled above
+        throw_runtime_exception("Unknown device in cuda block size deduction");
+        return 0;  // presumably unreachable once the exception is thrown
+    }
+  }();
+#endif
+
+  // Overall occupancy in blocks: the smallest of the three limits above
+  return std::min({max_blocks_regs, max_blocks_shmem, max_blocks_per_sm});
+}
+
+template <typename UnaryFunction, typename LaunchBounds>
+inline int cuda_deduce_block_size(bool early_termination,
+                                  cudaDeviceProp const& properties,
+                                  cudaFuncAttributes const& attributes,
+                                  UnaryFunction block_size_to_dynamic_shmem,
+                                  LaunchBounds) {
+  // Searches block sizes for the most threads/SM; first gather the limits
+  int const max_threads_per_sm = properties.maxThreadsPerMultiProcessor;
+  // Original author's note: unsure whether capping by LaunchBounds is needed
+  // here or is already accounted for in the functor attributes
+  int const max_threads_per_block =
+      std::min(LaunchBounds::maxTperB == 0 ? (int)properties.maxThreadsPerBlock
+                                           : (int)LaunchBounds::maxTperB,
+               attributes.maxThreadsPerBlock);
+  int const min_blocks_per_sm =
+      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
+
+  // Best candidate so far; 0 is returned if no block size is ever accepted
+  int opt_block_size     = 0;
+  int opt_threads_per_sm = 0;
+
+  for (int block_size = max_threads_per_block; block_size > 0;
+       block_size -= 32) {  // walk downwards in steps of 32 threads
+    size_t const dynamic_shmem = block_size_to_dynamic_shmem(block_size);
+
+    int blocks_per_sm = cuda_max_active_blocks_per_sm(
+        properties, attributes, block_size, dynamic_shmem);
+
+    int threads_per_sm = blocks_per_sm * block_size;
+
+    if (threads_per_sm > max_threads_per_sm) {  // clamp to the SM thread limit
+      blocks_per_sm  = max_threads_per_sm / block_size;
+      threads_per_sm = blocks_per_sm * block_size;
+    }
+
+    if (blocks_per_sm >= min_blocks_per_sm) {
+      if (threads_per_sm >= opt_threads_per_sm) {  // '>=': ties go to the later, smaller size
+        opt_block_size     = block_size;
+        opt_threads_per_sm = threads_per_sm;
+      }
+    }
+
+    if (early_termination && blocks_per_sm != 0) break;  // exits even if min_blocks_per_sm was not met
+  }
+
+  return opt_block_size;
 }

 template <class FunctorType, class LaunchBounds>
@ -72,295 +152,24 @@ int cuda_get_max_block_size(const CudaInternal* cuda_instance,
                            const FunctorType& f, const size_t vector_length,
                            const size_t shmem_block,
                            const size_t shmem_thread) {
-  const int min_blocks_per_sm =
-      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
-  const int max_threads_per_block = LaunchBounds::maxTperB == 0
-                                        ? cuda_instance->m_maxThreadsPerBlock
-                                        : LaunchBounds::maxTperB;
+  (void)cuda_instance;

-  const int regs_per_thread     = attr.numRegs;
-  const int regs_per_sm         = cuda_instance->m_regsPerSM;
-  const int shmem_per_sm        = cuda_instance->m_shmemPerSM;
-  const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
-  const int max_blocks_per_sm   = cuda_instance->m_maxBlocksPerSM;
-  const int max_threads_per_sm  = cuda_instance->m_maxThreadsPerSM;
+  auto const& prop = Kokkos::Cuda().cuda_device_prop();

-  int block_size = std::min(attr.maxThreadsPerBlock, max_threads_per_block);
+  auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
+                                            shmem_thread](int block_size) {
+    size_t const functor_shmem =
+        Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+            f, block_size / vector_length);

-  int functor_shmem =
-      FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
-  int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                    functor_shmem + attr.sharedSizeBytes;
-  int max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
-  int max_blocks_shmem =
-      (total_shmem < max_shmem_per_block)
-          ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-          : 0;
-  int blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-  int threads_per_sm = blocks_per_sm * block_size;
-  if (threads_per_sm > max_threads_per_sm) {
-    blocks_per_sm  = max_threads_per_sm / block_size;
-    threads_per_sm = blocks_per_sm * block_size;
-  }
-  int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : 0;
-  int opt_threads_per_sm = threads_per_sm;
-  // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i
-  // Achieved: %i %i Opt: %i %i\n",block_size,
-  //   shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
-  //   regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
-  block_size -= 32;
-  while ((blocks_per_sm == 0) && (block_size >= 32)) {
-    functor_shmem =
-        FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
-    total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                  functor_shmem + attr.sharedSizeBytes;
-    max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
-    max_blocks_shmem =
-        (total_shmem < max_shmem_per_block)
-            ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-            : 0;
-    blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-    threads_per_sm = blocks_per_sm * block_size;
-    if (threads_per_sm > max_threads_per_sm) {
-      blocks_per_sm  = max_threads_per_sm / block_size;
-      threads_per_sm = blocks_per_sm * block_size;
-    }
-    if ((blocks_per_sm >= min_blocks_per_sm) &&
-        (blocks_per_sm <= max_blocks_per_sm)) {
-      if (threads_per_sm >= opt_threads_per_sm) {
-        opt_block_size     = block_size;
-        opt_threads_per_sm = threads_per_sm;
-      }
-    }
-    // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i
-    // Achieved: %i %i Opt: %i %i\n",block_size,
-    //   shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem,
-    //   regs_per_sm,regs_per_thread,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm);
-    block_size -= 32;
-  }
-  return opt_block_size;
-}
+    size_t const dynamic_shmem = shmem_block +
+                                 shmem_thread * (block_size / vector_length) +
+                                 functor_shmem;
+    return dynamic_shmem;
+  };

-template <class DriverType>
-struct CudaGetMaxBlockSize<DriverType, Kokkos::LaunchBounds<>, true> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int numBlocks;
-    int blockSize = 1024;
-    int sharedmem =
-        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-        FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-            f, blockSize / vector_length);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks, cuda_parallel_launch_constant_memory<DriverType>, blockSize,
-        sharedmem);
-
-    if (numBlocks > 0) return blockSize;
-    while (blockSize > 32 && numBlocks == 0) {
-      blockSize /= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
-          blockSize, sharedmem);
-    }
-    int blockSizeUpperBound = blockSize * 2;
-    while (blockSize < blockSizeUpperBound && numBlocks > 0) {
-      blockSize += 32;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
-          blockSize, sharedmem);
-    }
-    return blockSize - 32;
-  }
-};
-
-template <class DriverType>
-struct CudaGetMaxBlockSize<DriverType, Kokkos::LaunchBounds<>, false> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int numBlocks;
-
-    unsigned int blockSize = 1024;
-    unsigned int sharedmem =
-        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-        FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-            f, blockSize / vector_length);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
-        sharedmem);
-
-    if (numBlocks > 0) return blockSize;
-    while (blockSize > 32 && numBlocks == 0) {
-      blockSize /= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
-          sharedmem);
-    }
-    unsigned int blockSizeUpperBound = blockSize * 2;
-    while (blockSize < blockSizeUpperBound && numBlocks > 0) {
-      blockSize += 32;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
-          sharedmem);
-    }
-    return blockSize - 32;
-  }
-};
-
-template <class DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
-struct CudaGetMaxBlockSize<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    true> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int numBlocks = 0, oldNumBlocks = 0;
-    unsigned int blockSize = MaxThreadsPerBlock;
-    unsigned int sharedmem =
-        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-        FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-            f, blockSize / vector_length);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks,
-        cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
-                                             MinBlocksPerSM>,
-        blockSize, sharedmem);
-
-    if (static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM)
-      return blockSize;
-
-    while (blockSize > 32 &&
-           static_cast<unsigned int>(numBlocks) < MinBlocksPerSM) {
-      blockSize /= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
-          blockSize, sharedmem);
-    }
-    unsigned int blockSizeUpperBound =
-        (blockSize * 2 < MaxThreadsPerBlock ? blockSize * 2
-                                            : MaxThreadsPerBlock);
-    while (blockSize<blockSizeUpperBound&& static_cast<unsigned int>(numBlocks)>
-               MinBlocksPerSM) {
-      blockSize += 32;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-      oldNumBlocks = numBlocks;
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
-          blockSize, sharedmem);
-    }
-    if (static_cast<unsigned int>(oldNumBlocks) >= MinBlocksPerSM)
-      return blockSize - 32;
-    return -1;
-  }
-};
-
-template <class DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
-struct CudaGetMaxBlockSize<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    false> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int numBlocks = 0, oldNumBlocks = 0;
-    unsigned int blockSize = MaxThreadsPerBlock;
-    int sharedmem =
-        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-        FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-            f, blockSize / vector_length);
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks,
-        cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                          MinBlocksPerSM>,
-        blockSize, sharedmem);
-    if (static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM)
-      return blockSize;
-
-    while (blockSize > 32 &&
-           static_cast<unsigned int>(numBlocks) < MinBlocksPerSM) {
-      blockSize /= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
-          sharedmem);
-    }
-    unsigned int blockSizeUpperBound =
-        (blockSize * 2 < MaxThreadsPerBlock ? blockSize * 2
-                                            : MaxThreadsPerBlock);
-    while (blockSize < blockSizeUpperBound &&
-           static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM) {
-      blockSize += 32;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-      oldNumBlocks = numBlocks;
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
-          sharedmem);
-    }
-    if (static_cast<unsigned int>(oldNumBlocks) >= MinBlocksPerSM)
-      return blockSize - 32;
-    return -1;
-  }
-};
-
-template <class DriverType, class LaunchBounds, bool Large>
-struct CudaGetOptBlockSize;
-
-template <class DriverType, class LaunchBounds>
-int cuda_get_opt_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-  return CudaGetOptBlockSize<
-      DriverType, LaunchBounds,
-      // LaunchBounds::launch_mechanism == Kokkos::Experimental::LaunchDefault ?
-      //            (( CudaTraits::ConstantMemoryUseThreshold <
-      //            sizeof(DriverType) )?
-      //                   Kokkos::Experimental::CudaLaunchConstantMemory:Kokkos::Experimental::CudaLaunchLocalMemory):
-      //             LaunchBounds::launch_mechanism
-      (CudaTraits::ConstantMemoryUseThreshold <
-       sizeof(DriverType))>::get_block_size(f, vector_length, shmem_extra_block,
-                                            shmem_extra_thread);
+  return cuda_deduce_block_size(true, prop, attr, block_size_to_dynamic_shmem,
+                                LaunchBounds{});
 }

 template <class FunctorType, class LaunchBounds>
@ -369,221 +178,26 @@ int cuda_get_opt_block_size(const CudaInternal* cuda_instance,
                            const FunctorType& f, const size_t vector_length,
                            const size_t shmem_block,
                            const size_t shmem_thread) {
-  const int min_blocks_per_sm =
-      LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
-  const int max_threads_per_block = LaunchBounds::maxTperB == 0
-                                        ? cuda_instance->m_maxThreadsPerBlock
-                                        : LaunchBounds::maxTperB;
+  (void)cuda_instance;

-  const int regs_per_thread     = attr.numRegs;
-  const int regs_per_sm         = cuda_instance->m_regsPerSM;
-  const int shmem_per_sm        = cuda_instance->m_shmemPerSM;
-  const int max_shmem_per_block = cuda_instance->m_maxShmemPerBlock;
-  const int max_blocks_per_sm   = cuda_instance->m_maxBlocksPerSM;
-  const int max_threads_per_sm  = cuda_instance->m_maxThreadsPerSM;
+  auto const& prop = Kokkos::Cuda().cuda_device_prop();

-  int block_size = std::min(attr.maxThreadsPerBlock, max_threads_per_block);
+  auto const block_size_to_dynamic_shmem = [&f, vector_length, shmem_block,
+                                            shmem_thread](int block_size) {
+    size_t const functor_shmem =
+        Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
+            f, block_size / vector_length);

-  int functor_shmem =
-      FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
-  int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                    functor_shmem + attr.sharedSizeBytes;
-  int max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
-  int max_blocks_shmem =
-      (total_shmem < max_shmem_per_block)
-          ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-          : 0;
-  int blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-  int threads_per_sm = blocks_per_sm * block_size;
-  if (threads_per_sm > max_threads_per_sm) {
-    blocks_per_sm  = max_threads_per_sm / block_size;
-    threads_per_sm = blocks_per_sm * block_size;
-  }
-  int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : 0;
-  int opt_threads_per_sm = threads_per_sm;
+    size_t const dynamic_shmem = shmem_block +
+                                 shmem_thread * (block_size / vector_length) +
+                                 functor_shmem;
+    return dynamic_shmem;
+  };

-  block_size -= 32;
-  while ((block_size >= 32)) {
-    functor_shmem =
-        FunctorTeamShmemSize<FunctorType>::value(f, block_size / vector_length);
-    total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
-                  functor_shmem + attr.sharedSizeBytes;
-    max_blocks_regs = regs_per_sm / (regs_per_thread * block_size);
-    max_blocks_shmem =
-        (total_shmem < max_shmem_per_block)
-            ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
-            : 0;
-    blocks_per_sm  = std::min(max_blocks_regs, max_blocks_shmem);
-    threads_per_sm = blocks_per_sm * block_size;
-    if (threads_per_sm > max_threads_per_sm) {
-      blocks_per_sm  = max_threads_per_sm / block_size;
-      threads_per_sm = blocks_per_sm * block_size;
-    }
-    if ((blocks_per_sm >= min_blocks_per_sm) &&
-        (blocks_per_sm <= max_blocks_per_sm)) {
-      if (threads_per_sm >= opt_threads_per_sm) {
-        opt_block_size     = block_size;
-        opt_threads_per_sm = threads_per_sm;
-      }
-    }
-    block_size -= 32;
-  }
-  return opt_block_size;
+  return cuda_deduce_block_size(false, prop, attr, block_size_to_dynamic_shmem,
+                                LaunchBounds{});
 }

-template <class DriverType>
-struct CudaGetOptBlockSize<DriverType, Kokkos::LaunchBounds<0, 0>, true> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = 16;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-
-    while (blockSize < 1024) {
-      blockSize *= 2;
-
-      // calculate the occupancy with that optBlockSize and check whether its
-      // larger than the largest one found so far
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
-          blockSize, sharedmem);
-      if (maxOccupancy < numBlocks * blockSize) {
-        maxOccupancy  = numBlocks * blockSize;
-        bestBlockSize = blockSize;
-      }
-    }
-    return bestBlockSize;
-  }
-};
-
-template <class DriverType>
-struct CudaGetOptBlockSize<DriverType, Kokkos::LaunchBounds<0, 0>, false> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = 16;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-
-    while (blockSize < 1024) {
-      blockSize *= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks, cuda_parallel_launch_local_memory<DriverType>, blockSize,
-          sharedmem);
-
-      if (maxOccupancy < numBlocks * blockSize) {
-        maxOccupancy  = numBlocks * blockSize;
-        bestBlockSize = blockSize;
-      }
-    }
-    return bestBlockSize;
-  }
-};
-
-template <class DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
-struct CudaGetOptBlockSize<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    true> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = 16;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-    int max_threads_per_block =
-        std::min(MaxThreadsPerBlock,
-                 cuda_internal_maximum_warp_count() * CudaTraits::WarpSize);
-
-    while (blockSize < max_threads_per_block) {
-      blockSize *= 2;
-
-      // calculate the occupancy with that optBlockSize and check whether its
-      // larger than the largest one found so far
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks,
-          cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
-                                               MinBlocksPerSM>,
-          blockSize, sharedmem);
-      if (numBlocks >= int(MinBlocksPerSM) &&
-          blockSize <= int(MaxThreadsPerBlock)) {
-        if (maxOccupancy < numBlocks * blockSize) {
-          maxOccupancy  = numBlocks * blockSize;
-          bestBlockSize = blockSize;
-        }
-      }
-    }
-    if (maxOccupancy > 0) return bestBlockSize;
-    return -1;
-  }
-};
-
-template <class DriverType, unsigned int MaxThreadsPerBlock,
-          unsigned int MinBlocksPerSM>
-struct CudaGetOptBlockSize<
-    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
-    false> {
-  static int get_block_size(const typename DriverType::functor_type& f,
-                            const size_t vector_length,
-                            const size_t shmem_extra_block,
-                            const size_t shmem_extra_thread) {
-    int blockSize = 16;
-    int numBlocks;
-    int sharedmem;
-    int maxOccupancy  = 0;
-    int bestBlockSize = 0;
-    int max_threads_per_block =
-        std::min(MaxThreadsPerBlock,
-                 cuda_internal_maximum_warp_count() * CudaTraits::WarpSize);
-
-    while (blockSize < max_threads_per_block) {
-      blockSize *= 2;
-      sharedmem =
-          shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
-          FunctorTeamShmemSize<typename DriverType::functor_type>::value(
-              f, blockSize / vector_length);
-
-      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-          &numBlocks,
-          cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                            MinBlocksPerSM>,
-          blockSize, sharedmem);
-      if (numBlocks >= int(MinBlocksPerSM) &&
-          blockSize <= int(MaxThreadsPerBlock)) {
-        if (maxOccupancy < numBlocks * blockSize) {
-          maxOccupancy  = numBlocks * blockSize;
-          bestBlockSize = blockSize;
-        }
-      }
-    }
-    if (maxOccupancy > 0) return bestBlockSize;
-    return -1;
-  }
-};
-
 }  // namespace Impl
 }  // namespace Kokkos

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@ -50,7 +50,7 @@

 #include <impl/Kokkos_Error.hpp>

-#include <iostream>
+#include <iosfwd>

 namespace Kokkos {
 namespace Impl {
@ -113,12 +113,7 @@ class CudaRawMemoryAllocationFailure : public RawMemoryAllocationFailure {
               get_failure_mode(arg_error_code), arg_mechanism),
        m_error_code(arg_error_code) {}

-  void append_additional_error_information(std::ostream& o) const override {
-    if (m_error_code != cudaSuccess) {
-      o << "  The Cuda allocation returned the error code \"\""
-        << cudaGetErrorName(m_error_code) << "\".";
-    }
-  }
+  void append_additional_error_information(std::ostream& o) const override;
 };

 }  // end namespace Experimental
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@ -55,7 +55,7 @@
 #include <Cuda/Kokkos_Cuda_Instance.hpp>
 #include <Cuda/Kokkos_Cuda_Locks.hpp>
 #include <impl/Kokkos_Error.hpp>
-#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <impl/Kokkos_Tools.hpp>

 /*--------------------------------------------------------------------------*/
 /* Standard 'C' libraries */
@ -134,7 +134,7 @@ bool cuda_launch_blocking() {

  if (env == 0) return false;

-  return atoi(env);
+  return std::stoi(env);
 }
 #endif

@ -239,8 +239,9 @@ const CudaInternalDevices &CudaInternalDevices::singleton() {

 }  // namespace

-int CudaInternal::was_initialized = 0;
-int CudaInternal::was_finalized   = 0;
+unsigned long *CudaInternal::constantMemHostStaging = nullptr;
+cudaEvent_t CudaInternal::constantMemReusable       = nullptr;
+
 //----------------------------------------------------------------------------

 void CudaInternal::print_configuration(std::ostream &s) const {
@ -288,11 +289,11 @@ CudaInternal::~CudaInternal() {
  m_scratchUnifiedCount     = 0;
  m_scratchUnifiedSupported = 0;
  m_streamCount             = 0;
-  m_scratchSpace            = 0;
-  m_scratchFlags            = 0;
-  m_scratchUnified          = 0;
-  m_scratchConcurrentBitset = 0;
-  m_stream                  = 0;
+  m_scratchSpace            = nullptr;
+  m_scratchFlags            = nullptr;
+  m_scratchUnified          = nullptr;
+  m_scratchConcurrentBitset = nullptr;
+  m_stream                  = nullptr;
 }

 int CudaInternal::verify_is_initialized(const char *const label) const {
@ -307,22 +308,20 @@ CudaInternal &CudaInternal::singleton() {
  static CudaInternal self;
  return self;
 }
-void CudaInternal::fence() const { cudaStreamSynchronize(m_stream); }
+void CudaInternal::fence() const {
+  CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream));
+}

 void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
  if (was_finalized)
    Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
-  was_initialized = 1;
+  was_initialized = true;
  if (is_initialized()) return;

  enum { WordSize = sizeof(size_type) };

 #ifndef KOKKOS_IMPL_TURN_OFF_CUDA_HOST_INIT_CHECK
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  if (!HostSpace::execution_space::is_initialized()) {
-#else
  if (!HostSpace::execution_space::impl_is_initialized()) {
-#endif
    const std::string msg(
        "Cuda::initialize ERROR : HostSpace::execution_space is not "
        "initialized");
@ -332,7 +331,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {

  const CudaInternalDevices &dev_info = CudaInternalDevices::singleton();

-  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags;
+  const bool ok_init = nullptr == m_scratchSpace || nullptr == m_scratchFlags;

  const bool ok_id =
      0 <= cuda_device_id && cuda_device_id < dev_info.m_cudaDevCount;
@ -366,7 +365,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
    int compiled_major = m_cudaArch / 100;
    int compiled_minor = (m_cudaArch % 100) / 10;

-    if (compiled_major != cudaProp.major || compiled_minor < cudaProp.minor) {
+    if (compiled_major != cudaProp.major || compiled_minor > cudaProp.minor) {
      std::stringstream ss;
      ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for "
            "compute capability "
@ -453,8 +452,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {

      // Allocate and initialize uint32_t[ buffer_bound ]

-      typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
-          Record;
+      using Record =
+          Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;

      Record *const r =
          Record::allocate(Kokkos::CudaSpace(), "InternalScratchBitset",
@ -511,7 +510,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
  if (env_force_device_alloc == 0)
    force_device_alloc = false;
  else
-    force_device_alloc = atoi(env_force_device_alloc) != 0;
+    force_device_alloc = std::stoi(env_force_device_alloc) != 0;

  const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
  bool visible_devices_one        = true;
@ -542,14 +541,23 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
 #endif

  // Init the array for used for arbitrarily sized atomics
-  if (stream == 0) Impl::initialize_host_cuda_lock_arrays();
+  if (stream == nullptr) Impl::initialize_host_cuda_lock_arrays();
+
+  // Allocate a staging buffer for constant mem in pinned host memory
+  // and an event to avoid overwriting driver for previous kernel launches
+  if (stream == nullptr) {
+    CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging,
+                                  CudaTraits::ConstantMemoryUsage));
+
+    CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
+  }

  m_stream = stream;
 }

 //----------------------------------------------------------------------------

-typedef Cuda::size_type ScratchGrain[Impl::CudaTraits::WarpSize];
+using ScratchGrain = Cuda::size_type[Impl::CudaTraits::WarpSize];
 enum { sizeScratchGrain = sizeof(ScratchGrain) };

 Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
@ -557,8 +565,8 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
      m_scratchFlagsCount * sizeScratchGrain < size) {
    m_scratchFlagsCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

-    typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
-        Record;
+    using Record =
+        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;

    if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags));

@ -582,8 +590,8 @@ Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const {
      m_scratchSpaceCount * sizeScratchGrain < size) {
    m_scratchSpaceCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

-    typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
-        Record;
+    using Record =
+        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;

    if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace));

@ -605,9 +613,8 @@ Cuda::size_type *CudaInternal::scratch_unified(
      m_scratchUnifiedCount * sizeScratchGrain < size) {
    m_scratchUnifiedCount = (size + sizeScratchGrain - 1) / sizeScratchGrain;

-    typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
-                                                 void>
-        Record;
+    using Record =
+        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>;

    if (m_scratchUnified)
      Record::decrement(Record::get_record(m_scratchUnified));
@ -629,8 +636,8 @@ Cuda::size_type *CudaInternal::scratch_functor(
  if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
    m_scratchFunctorSize = size;

-    typedef Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>
-        Record;
+    using Record =
+        Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;

    if (m_scratchFunctor)
      Record::decrement(Record::get_record(m_scratchFunctor));
@ -649,15 +656,13 @@ Cuda::size_type *CudaInternal::scratch_functor(
 //----------------------------------------------------------------------------

 void CudaInternal::finalize() {
-  was_finalized = 1;
-  if (0 != m_scratchSpace || 0 != m_scratchFlags) {
+  was_finalized = true;
+  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
    Impl::finalize_host_cuda_lock_arrays();

-    if (m_stream != 0) cudaStreamDestroy(m_stream);
-
-    typedef Kokkos::Impl::SharedAllocationRecord<CudaSpace> RecordCuda;
-    typedef Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>
-        RecordHost;
+    using RecordCuda = Kokkos::Impl::SharedAllocationRecord<CudaSpace>;
+    using RecordHost =
+        Kokkos::Impl::SharedAllocationRecord<CudaHostPinnedSpace>;

    RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags));
    RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace));
@ -675,11 +680,17 @@ void CudaInternal::finalize() {
    m_scratchFlagsCount       = 0;
    m_scratchUnifiedCount     = 0;
    m_streamCount             = 0;
-    m_scratchSpace            = 0;
-    m_scratchFlags            = 0;
-    m_scratchUnified          = 0;
-    m_scratchConcurrentBitset = 0;
-    m_stream                  = 0;
+    m_scratchSpace            = nullptr;
+    m_scratchFlags            = nullptr;
+    m_scratchUnified          = nullptr;
+    m_scratchConcurrentBitset = nullptr;
+    m_stream                  = nullptr;
+  }
+
+  // only destroy these if we're finalizing the singleton
+  if (this == &singleton()) {
+    cudaFreeHost(constantMemHostStaging);
+    cudaEventDestroy(constantMemReusable);
  }
 }

@ -743,27 +754,13 @@ int Cuda::concurrency() {
  return Impl::CudaInternal::singleton().m_maxConcurrency;
 }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-int Cuda::is_initialized()
-#else
-int Cuda::impl_is_initialized()
-#endif
-{
+int Cuda::impl_is_initialized() {
  return Impl::CudaInternal::singleton().is_initialized();
 }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-void Cuda::initialize(const Cuda::SelectDevice config, size_t num_instances)
-#else
 void Cuda::impl_initialize(const Cuda::SelectDevice config,
-                           size_t /*num_instances*/)
-#endif
-{
-  Impl::CudaInternal::singleton().initialize(config.cuda_device_id, 0);
-
-#if defined(KOKKOS_ENABLE_PROFILING)
-  Kokkos::Profiling::initialize();
-#endif
+                           size_t /*num_instances*/) {
+  Impl::CudaInternal::singleton().initialize(config.cuda_device_id, nullptr);
 }

 std::vector<unsigned> Cuda::detect_device_arch() {
@ -793,48 +790,85 @@ Cuda::size_type Cuda::device_arch() {
  return dev_arch;
 }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-void Cuda::finalize()
-#else
-void Cuda::impl_finalize()
-#endif
-{
-  Impl::CudaInternal::singleton().finalize();
+void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); }

-#if defined(KOKKOS_ENABLE_PROFILING)
-  Kokkos::Profiling::finalize();
-#endif
-}
-
-Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton()) {
+Cuda::Cuda()
+    : m_space_instance(&Impl::CudaInternal::singleton()), m_counter(nullptr) {
  Impl::CudaInternal::singleton().verify_is_initialized(
      "Cuda instance constructor");
 }

-Cuda::Cuda(cudaStream_t stream) : m_space_instance(new Impl::CudaInternal) {
+Cuda::Cuda(cudaStream_t stream)
+    : m_space_instance(new Impl::CudaInternal), m_counter(new int(1)) {
  Impl::CudaInternal::singleton().verify_is_initialized(
      "Cuda instance constructor");
  m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
                               stream);
 }

+// Move constructor: takes over other's instance pointer and reference counter
+// and leaves other empty, so destroying other releases nothing.
+KOKKOS_FUNCTION Cuda::Cuda(Cuda &&other) noexcept {
+  m_space_instance       = other.m_space_instance;
+  other.m_space_instance = nullptr;
+  m_counter              = other.m_counter;
+  other.m_counter        = nullptr;
+}
+
+// Copy constructor: shares other's instance. In host code the reference
+// counter is incremented when there is one (it is null for handles to the
+// singleton, see the default constructor).
+KOKKOS_FUNCTION Cuda::Cuda(const Cuda &other)
+    : m_space_instance(other.m_space_instance), m_counter(other.m_counter) {
+#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+  if (m_counter) Kokkos::atomic_add(m_counter, 1);
+#endif
+}
+
+// Move assignment: adopts other's state and leaves other empty. The state
+// *this held before is handed to a local temporary whose destructor drops
+// that reference, so a previously owned stream instance is no longer leaked
+// and self-move-assignment leaves *this unchanged.
+KOKKOS_FUNCTION Cuda &Cuda::operator=(Cuda &&other) noexcept {
+  Cuda released(static_cast<Cuda &&>(other));  // other is empty from here on
+  auto *const old_instance  = m_space_instance;
+  auto *const old_counter   = m_counter;
+  m_space_instance          = released.m_space_instance;
+  m_counter                 = released.m_counter;
+  released.m_space_instance = old_instance;
+  released.m_counter        = old_counter;
+  return *this;
+}
+
+// Copy assignment: copy-constructs a temporary (which takes the new reference
+// first, so self-assignment is safe) and move-assigns it, which releases the
+// reference *this held before instead of overwriting it.
+KOKKOS_FUNCTION Cuda &Cuda::operator=(const Cuda &other) {
+  return *this = Cuda(other);
+}
+
+// Destructor: in host code drops one reference; the handle that drops the last
+// one finalizes and deletes the instance. Handles without a counter (the
+// singleton, or moved-from objects) release nothing.
+KOKKOS_FUNCTION Cuda::~Cuda() noexcept {
+#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+  if (m_counter == nullptr) return;
+  int const count = Kokkos::atomic_fetch_sub(m_counter, 1);
+  if (count == 1) {
+    delete m_counter;
+    m_space_instance->finalize();
+    delete m_space_instance;
+  }
+#endif
+}
+
 void Cuda::print_configuration(std::ostream &s, const bool) {
  Impl::CudaInternal::singleton().print_configuration(s);
 }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-bool Cuda::sleep() { return false; }
-
-bool Cuda::wake() { return true; }
-#endif
-
 void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-void Cuda::fence() { impl_static_fence(); }
-#else
 void Cuda::fence() const { m_space_instance->fence(); }
-#endif

 const char *Cuda::name() { return "Cuda"; }

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp
@ -1,6 +1,8 @@
 #ifndef KOKKOS_CUDA_INSTANCE_HPP_
 #define KOKKOS_CUDA_INSTANCE_HPP_

+#include <vector>
+#include <impl/Kokkos_Tools.hpp>
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // These functions fulfill the purpose of allowing to work around
@ -15,25 +17,28 @@ namespace Kokkos {
 namespace Impl {

 struct CudaTraits {
-  enum { WarpSize = 32 /* 0x0020 */ };
-  enum { WarpIndexMask = 0x001f /* Mask for warpindex */ };
-  enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ };
-
-  enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
-  enum { ConstantMemoryCache = 0x002000 /*  8k bytes */ };
-  enum { KernelArgumentLimit = 0x001000 /*  4k bytes */ };
-
-  typedef unsigned long
-      ConstantGlobalBufferType[ConstantMemoryUsage / sizeof(unsigned long)];
-
-#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_PASCAL)
-  enum {
-    ConstantMemoryUseThreshold =
-        0x000200 /* 0 bytes -> always use constant (or global)*/
+  enum : CudaSpace::size_type { WarpSize = 32 /* 0x0020 */ };
+  enum : CudaSpace::size_type {
+    WarpIndexMask = 0x001f /* Mask for warpindex */
  };
-#else
+  enum : CudaSpace::size_type {
+    WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */
+  };
+
+  enum : CudaSpace::size_type {
+    ConstantMemoryUsage = 0x008000 /* 32k bytes */
+  };
+  enum : CudaSpace::size_type {
+    ConstantMemoryCache = 0x002000 /*  8k bytes */
+  };
+  enum : CudaSpace::size_type {
+    KernelArgumentLimit = 0x001000 /*  4k bytes */
+  };
+
+  using ConstantGlobalBufferType =
+      unsigned long[ConstantMemoryUsage / sizeof(unsigned long)];
+
  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
-#endif

  KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count(
      CudaSpace::size_type i) {
@ -42,7 +47,7 @@ struct CudaTraits {

  KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align(
      CudaSpace::size_type i) {
-    enum { Mask = ~CudaSpace::size_type(WarpIndexMask) };
+    constexpr CudaSpace::size_type Mask = ~WarpIndexMask;
    return (i + WarpIndexMask) & Mask;
  }
 };
@ -79,7 +84,7 @@ class CudaInternal {
 #endif

 public:
-  typedef Cuda::size_type size_type;
+  using size_type = Cuda::size_type;

  int m_cudaDev;

@ -112,18 +117,23 @@ class CudaInternal {
  uint32_t* m_scratchConcurrentBitset;
  cudaStream_t m_stream;

-  static int was_initialized;
-  static int was_finalized;
+  bool was_initialized = false;
+  bool was_finalized   = false;
+
+  // FIXME_CUDA: these want to be per-device, not per-stream...  use of 'static'
+  //  here will break once there are multiple devices though
+  static unsigned long* constantMemHostStaging;
+  static cudaEvent_t constantMemReusable;

  static CudaInternal& singleton();

  int verify_is_initialized(const char* const label) const;

  int is_initialized() const {
-    return 0 != m_scratchSpace && 0 != m_scratchFlags;
+    return nullptr != m_scratchSpace && nullptr != m_scratchFlags;
  }

-  void initialize(int cuda_device_id, cudaStream_t stream = 0);
+  void initialize(int cuda_device_id, cudaStream_t stream = nullptr);
  void finalize();

  void print_configuration(std::ostream&) const;
@ -157,12 +167,12 @@ class CudaInternal {
        m_scratchFunctorSize(0),
        m_scratchUnifiedSupported(0),
        m_streamCount(0),
-        m_scratchSpace(0),
-        m_scratchFlags(0),
-        m_scratchUnified(0),
-        m_scratchFunctor(0),
-        m_scratchConcurrentBitset(0),
-        m_stream(0) {}
+        m_scratchSpace(nullptr),
+        m_scratchFlags(nullptr),
+        m_scratchUnified(nullptr),
+        m_scratchFunctor(nullptr),
+        m_scratchConcurrentBitset(nullptr),
+        m_stream(nullptr) {}

  size_type* scratch_space(const size_type size) const;
  size_type* scratch_flags(const size_type size) const;
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@ -244,9 +244,6 @@ struct CudaParallelLaunch<
                            const CudaInternal* cuda_instance,
                            const bool prefer_shmem) {
    if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
-      // Fence before changing settings and copying closure
-      Kokkos::Cuda().fence();
-
      if (cuda_instance->m_maxShmemPerBlock < shmem) {
        Kokkos::Impl::throw_runtime_exception(std::string(
            "CudaParallelLaunch FAILED: shared memory request is too large"));
@ -254,28 +251,43 @@ struct CudaParallelLaunch<
 #ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
-        CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-            cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
-                                                 MinBlocksPerSM>,
-            (prefer_shmem ? cudaFuncCachePreferShared
-                          : cudaFuncCachePreferL1)));
+        static bool cache_config_set = false;
+        if (!cache_config_set) {
+          CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+              cuda_parallel_launch_constant_memory<
+                  DriverType, MaxThreadsPerBlock, MinBlocksPerSM>,
+              (prefer_shmem ? cudaFuncCachePreferShared
+                            : cudaFuncCachePreferL1)));
+          cache_config_set = true;
+        }
      }
 #else
      (void)prefer_shmem;
 #endif

-      // Copy functor to constant memory on the device
-      cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, &driver,
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Wait until the previous kernel that uses the constant buffer is done
+      CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+
+      // Copy functor (synchronously) to staging buffer in pinned host memory
+      unsigned long* staging = cuda_instance->constantMemHostStaging;
+      memcpy(staging, &driver, sizeof(DriverType));
+
+      // Copy functor asynchronously from there to constant memory on the device
+      cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
                              sizeof(DriverType), 0, cudaMemcpyHostToDevice,
                              cudaStream_t(cuda_instance->m_stream));

-      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
-
      // Invoke the driver function on the device
      cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
                                           MinBlocksPerSM>
          <<<grid, block, shmem, cuda_instance->m_stream>>>();

+      // Record an event that says when the constant buffer can be reused
+      CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
+                                     cudaStream_t(cuda_instance->m_stream)));
+
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
      CUDA_SAFE_CALL(cudaGetLastError());
      Kokkos::Cuda().fence();
@ -284,11 +296,15 @@ struct CudaParallelLaunch<
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
-    cudaFuncAttributes attr;
-    CUDA_SAFE_CALL(cudaFuncGetAttributes(
-        &attr,
-        cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
-                                             MinBlocksPerSM>));
+    static cudaFuncAttributes attr;
+    static bool attr_set = false;
+    if (!attr_set) {
+      CUDA_SAFE_CALL(cudaFuncGetAttributes(
+          &attr,
+          cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
+                                               MinBlocksPerSM>));
+      attr_set = true;
+    }
    return attr;
  }
 };
@ -304,9 +320,6 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
                            const CudaInternal* cuda_instance,
                            const bool prefer_shmem) {
    if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) {
-      // Fence before changing settings and copying closure
-      Kokkos::Cuda().fence();
-
      if (cuda_instance->m_maxShmemPerBlock < shmem) {
        Kokkos::Impl::throw_runtime_exception(std::string(
            "CudaParallelLaunch FAILED: shared memory request is too large"));
@ -314,26 +327,41 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
 #ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
-        CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-            cuda_parallel_launch_constant_memory<DriverType>,
-            (prefer_shmem ? cudaFuncCachePreferShared
-                          : cudaFuncCachePreferL1)));
+        static bool cache_config_set = false;
+        if (!cache_config_set) {
+          CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+              cuda_parallel_launch_constant_memory<DriverType>,
+              (prefer_shmem ? cudaFuncCachePreferShared
+                            : cudaFuncCachePreferL1)));
+          cache_config_set = true;
+        }
      }
 #else
      (void)prefer_shmem;
 #endif

-      // Copy functor to constant memory on the device
-      cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, &driver,
+      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
+
+      // Wait until the previous kernel that uses the constant buffer is done
+      CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+
+      // Copy functor (synchronously) to staging buffer in pinned host memory
+      unsigned long* staging = cuda_instance->constantMemHostStaging;
+      memcpy(staging, &driver, sizeof(DriverType));
+
+      // Copy functor asynchronously from there to constant memory on the device
+      cudaMemcpyToSymbolAsync(kokkos_impl_cuda_constant_memory_buffer, staging,
                              sizeof(DriverType), 0, cudaMemcpyHostToDevice,
                              cudaStream_t(cuda_instance->m_stream));

-      KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
-
      // Invoke the driver function on the device
      cuda_parallel_launch_constant_memory<DriverType>
          <<<grid, block, shmem, cuda_instance->m_stream>>>();

+      // Record an event that says when the constant buffer can be reused
+      CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
+                                     cudaStream_t(cuda_instance->m_stream)));
+
 #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
      CUDA_SAFE_CALL(cudaGetLastError());
      Kokkos::Cuda().fence();
@ -342,9 +370,13 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
-    cudaFuncAttributes attr;
-    CUDA_SAFE_CALL(cudaFuncGetAttributes(
-        &attr, cuda_parallel_launch_constant_memory<DriverType>));
+    static cudaFuncAttributes attr;
+    static bool attr_set = false;
+    if (!attr_set) {
+      CUDA_SAFE_CALL(cudaFuncGetAttributes(
+          &attr, cuda_parallel_launch_constant_memory<DriverType>));
+      attr_set = true;
+    }
    return attr;
  }
 };
@ -369,11 +401,15 @@ struct CudaParallelLaunch<
 #ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
-        CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-            cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                              MinBlocksPerSM>,
-            (prefer_shmem ? cudaFuncCachePreferShared
-                          : cudaFuncCachePreferL1)));
+        static bool cache_config_set = false;
+        if (!cache_config_set) {
+          CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+              cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
+                                                MinBlocksPerSM>,
+              (prefer_shmem ? cudaFuncCachePreferShared
+                            : cudaFuncCachePreferL1)));
+          cache_config_set = true;
+        }
      }
 #else
      (void)prefer_shmem;
@ -394,10 +430,15 @@ struct CudaParallelLaunch<
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
-    cudaFuncAttributes attr;
-    CUDA_SAFE_CALL(cudaFuncGetAttributes(
-        &attr, cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
-                                                 MinBlocksPerSM>));
+    static cudaFuncAttributes attr;
+    static bool attr_set = false;
+    if (!attr_set) {
+      CUDA_SAFE_CALL(cudaFuncGetAttributes(
+          &attr,
+          cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
+                                            MinBlocksPerSM>));
+      attr_set = true;
+    }
    return attr;
  }
 };
@ -420,10 +461,14 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
 #ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
-        CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-            cuda_parallel_launch_local_memory<DriverType>,
-            (prefer_shmem ? cudaFuncCachePreferShared
-                          : cudaFuncCachePreferL1)));
+        static bool cache_config_set = false;
+        if (!cache_config_set) {
+          CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+              cuda_parallel_launch_local_memory<DriverType>,
+              (prefer_shmem ? cudaFuncCachePreferShared
+                            : cudaFuncCachePreferL1)));
+          cache_config_set = true;
+        }
      }
 #else
      (void)prefer_shmem;
@ -443,9 +488,13 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
-    cudaFuncAttributes attr;
-    CUDA_SAFE_CALL(cudaFuncGetAttributes(
-        &attr, cuda_parallel_launch_local_memory<DriverType>));
+    static cudaFuncAttributes attr;
+    static bool attr_set = false;
+    if (!attr_set) {
+      CUDA_SAFE_CALL(cudaFuncGetAttributes(
+          &attr, cuda_parallel_launch_local_memory<DriverType>));
+      attr_set = true;
+    }
    return attr;
  }
 };
@ -467,11 +516,15 @@ struct CudaParallelLaunch<
 #ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
-        CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-            cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
-                                               MinBlocksPerSM>,
-            (prefer_shmem ? cudaFuncCachePreferShared
-                          : cudaFuncCachePreferL1)));
+        static bool cache_config_set = false;
+        if (!cache_config_set) {
+          CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+              cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
+                                                 MinBlocksPerSM>,
+              (prefer_shmem ? cudaFuncCachePreferShared
+                            : cudaFuncCachePreferL1)));
+          cache_config_set = true;
+        }
      }
 #else
      (void)prefer_shmem;
@ -497,11 +550,15 @@ struct CudaParallelLaunch<
    }
  }
  static cudaFuncAttributes get_cuda_func_attributes() {
-    cudaFuncAttributes attr;
-    CUDA_SAFE_CALL(cudaFuncGetAttributes(
-        &attr,
-        cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
-                                           MinBlocksPerSM>));
+    static cudaFuncAttributes attr;
+    static bool attr_set = false;
+    if (!attr_set) {
+      CUDA_SAFE_CALL(cudaFuncGetAttributes(
+          &attr,
+          cuda_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
+                                             MinBlocksPerSM>));
+      attr_set = true;
+    }
    return attr;
  }
 };
@ -521,10 +578,14 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
 #ifndef KOKKOS_ARCH_KEPLER
      // On Kepler the L1 has no benefit since it doesn't cache reads
      else {
-        CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
-            cuda_parallel_launch_global_memory<DriverType>,
-            (prefer_shmem ? cudaFuncCachePreferShared
-                          : cudaFuncCachePreferL1)));
+        static bool cache_config_set = false;
+        if (!cache_config_set) {
+          CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+              cuda_parallel_launch_global_memory<DriverType>,
+              (prefer_shmem ? cudaFuncCachePreferShared
+                            : cudaFuncCachePreferL1)));
+          cache_config_set = true;
+        }
      }
 #else
      (void)prefer_shmem;
@ -549,9 +610,13 @@ struct CudaParallelLaunch<DriverType, Kokkos::LaunchBounds<0, 0>,
  }

  static cudaFuncAttributes get_cuda_func_attributes() {
-    cudaFuncAttributes attr;
-    CUDA_SAFE_CALL(cudaFuncGetAttributes(
-        &attr, cuda_parallel_launch_global_memory<DriverType>));
+    static cudaFuncAttributes attr;
+    static bool attr_set = false;
+    if (!attr_set) {
+      CUDA_SAFE_CALL(cudaFuncGetAttributes(
+          &attr, cuda_parallel_launch_global_memory<DriverType>));
+      attr_set = true;
+    }
    return attr;
  }
 };
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@ -48,8 +48,8 @@
 #include <Kokkos_Macros.hpp>
 #if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)

-#include <iostream>
 #include <algorithm>
+#include <string>
 #include <cstdio>
 #include <cstdint>

@ -63,10 +63,8 @@
 #include <Kokkos_Vectorization.hpp>
 #include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>

-#if defined(KOKKOS_ENABLE_PROFILING)
-#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <impl/Kokkos_Tools.hpp>
 #include <typeinfo>
-#endif

 #include <KokkosExp_MDRangePolicy.hpp>

@ -84,9 +82,9 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
    : public PolicyTraits<Properties...> {
 public:
  //! Tag this class as a kokkos execution policy
-  typedef TeamPolicyInternal execution_policy;
+  using execution_policy = TeamPolicyInternal;

-  typedef PolicyTraits<Properties...> traits;
+  using traits = PolicyTraits<Properties...>;

  template <class ExecSpace, class... OtherProperties>
  friend class TeamPolicyInternal;
@ -104,7 +102,7 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>

 public:
  //! Execution space of this execution policy
-  typedef Kokkos::Cuda execution_space;
+  using execution_space = Kokkos::Cuda;

  template <class... OtherProperties>
  TeamPolicyInternal(const TeamPolicyInternal<OtherProperties...>& p) {
@ -119,50 +117,12 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
    m_space                  = p.m_space;
  }

-  TeamPolicyInternal& operator=(const TeamPolicyInternal& p) {
-    m_league_size            = p.m_league_size;
-    m_team_size              = p.m_team_size;
-    m_vector_length          = p.m_vector_length;
-    m_team_scratch_size[0]   = p.m_team_scratch_size[0];
-    m_team_scratch_size[1]   = p.m_team_scratch_size[1];
-    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
-    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
-    m_chunk_size             = p.m_chunk_size;
-    m_space                  = p.m_space;
-    return *this;
-  }
-
  //----------------------------------------

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  template <class FunctorType>
-  static inline int team_size_max(const FunctorType& functor) {
-    int n = MAX_WARP * Impl::CudaTraits::WarpSize;
-
-    for (; n; n >>= 1) {
-      const int shmem_size =
-          /* for global reduce */ Impl::
-              cuda_single_inter_block_reduce_scan_shmem<
-                  false, FunctorType, typename traits::work_tag>(functor, n)
-          /* for team   reduce */
-          + (n + 2) * sizeof(double)
-          /* for team   shared */
-          + Impl::FunctorTeamShmemSize<FunctorType>::value(functor, n);
-
-      if (shmem_size < typename traits::execution_space()
-                           .impl_internal_space_instance()
-                           ->m_maxShmemPerBlock)
-        break;
-    }
-
-    return n;
-  }
-#endif
-
  template <class FunctorType>
  int team_size_max(const FunctorType& f, const ParallelForTag&) const {
-    typedef Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>
-        closure_type;
+    using closure_type =
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
    cudaFuncAttributes attr =
        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
            get_cuda_func_attributes();
@ -179,15 +139,15 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
  template <class FunctorType>
  inline int team_size_max(const FunctorType& f,
                           const ParallelReduceTag&) const {
-    typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                  TeamPolicyInternal, FunctorType>
-        functor_analysis_type;
-    typedef typename Impl::ParallelReduceReturnValue<
+    using functor_analysis_type =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              TeamPolicyInternal, FunctorType>;
+    using reducer_type = typename Impl::ParallelReduceReturnValue<
        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type reducer_type;
-    typedef Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                                 reducer_type>
-        closure_type;
+        FunctorType>::reducer_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             reducer_type>;
    return internal_team_size_max<closure_type>(f);
  }

@ -200,25 +160,10 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
    return internal_team_size_max<closure_type>(f);
  }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  template <class FunctorType>
-  static int team_size_recommended(const FunctorType& functor) {
-    return team_size_max(functor);
-  }
-
-  template <class FunctorType>
-  static int team_size_recommended(const FunctorType& functor,
-                                   const int vector_length) {
-    int max = team_size_max(functor) / vector_length;
-    if (max < 1) max = 1;
-    return max;
-  }
-#endif
-
  template <class FunctorType>
  int team_size_recommended(const FunctorType& f, const ParallelForTag&) const {
-    typedef Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>
-        closure_type;
+    using closure_type =
+        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;
    cudaFuncAttributes attr =
        CudaParallelLaunch<closure_type, typename traits::launch_bounds>::
            get_cuda_func_attributes();
@ -235,24 +180,24 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
  template <class FunctorType>
  inline int team_size_recommended(const FunctorType& f,
                                   const ParallelReduceTag&) const {
-    typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
-                                  TeamPolicyInternal, FunctorType>
-        functor_analysis_type;
-    typedef typename Impl::ParallelReduceReturnValue<
+    using functor_analysis_type =
+        Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,
+                              TeamPolicyInternal, FunctorType>;
+    using reducer_type = typename Impl::ParallelReduceReturnValue<
        void, typename functor_analysis_type::value_type,
-        FunctorType>::reducer_type reducer_type;
-    typedef Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                                 reducer_type>
-        closure_type;
+        FunctorType>::reducer_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             reducer_type>;
    return internal_team_size_recommended<closure_type>(f);
  }

  template <class FunctorType, class ReducerType>
  int team_size_recommended(const FunctorType& f, const ReducerType&,
                            const ParallelReduceTag&) const {
-    typedef Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
-                                 ReducerType>
-        closure_type;
+    using closure_type =
+        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
+                             ReducerType>;
    return internal_team_size_recommended<closure_type>(f);
  }

@ -401,44 +346,6 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>

  inline int chunk_size() const { return m_chunk_size; }

-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  /** \brief set chunk_size to a discrete value*/
-  inline TeamPolicyInternal set_chunk_size(
-      typename traits::index_type chunk_size_) const {
-    TeamPolicyInternal p = *this;
-    p.m_chunk_size       = chunk_size_;
-    return p;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal set_scratch_size(
-      const int& level, const PerTeamValue& per_team) const {
-    TeamPolicyInternal p         = *this;
-    p.m_team_scratch_size[level] = per_team.value;
-    return p;
-  };
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal set_scratch_size(
-      const int& level, const PerThreadValue& per_thread) const {
-    TeamPolicyInternal p           = *this;
-    p.m_thread_scratch_size[level] = per_thread.value;
-    return p;
-  };
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  inline TeamPolicyInternal set_scratch_size(
-      const int& level, const PerTeamValue& per_team,
-      const PerThreadValue& per_thread) const {
-    TeamPolicyInternal p           = *this;
-    p.m_team_scratch_size[level]   = per_team.value;
-    p.m_thread_scratch_size[level] = per_thread.value;
-    return p;
-  };
-#else
  /** \brief set chunk_size to a discrete value*/
  inline TeamPolicyInternal& set_chunk_size(
      typename traits::index_type chunk_size_) {
@ -471,46 +378,10 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
    m_thread_scratch_size[level] = per_thread.value;
    return *this;
  }
-#endif

-  typedef Kokkos::Impl::CudaTeamMember member_type;
+  using member_type = Kokkos::Impl::CudaTeamMember;

 protected:
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
-  /** \brief set chunk_size to a discrete value*/
-  inline TeamPolicyInternal internal_set_chunk_size(
-      typename traits::index_type chunk_size_) {
-    m_chunk_size = chunk_size_;
-    return *this;
-  }
-
-  /** \brief set per team scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal internal_set_scratch_size(
-      const int& level, const PerTeamValue& per_team) {
-    m_team_scratch_size[level] = per_team.value;
-    return *this;
-  }
-
-  /** \brief set per thread scratch size for a specific level of the scratch
-   * hierarchy */
-  inline TeamPolicyInternal internal_set_scratch_size(
-      const int& level, const PerThreadValue& per_thread) {
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-
-  /** \brief set per thread and per team scratch size for a specific level of
-   * the scratch hierarchy */
-  inline TeamPolicyInternal internal_set_scratch_size(
-      const int& level, const PerTeamValue& per_team,
-      const PerThreadValue& per_thread) {
-    m_team_scratch_size[level]   = per_team.value;
-    m_thread_scratch_size[level] = per_thread.value;
-    return *this;
-  }
-#endif
-
  template <class ClosureType, class FunctorType, class BlockSizeCallable>
  int internal_team_size_common(const FunctorType& f,
                                BlockSizeCallable&& block_size_callable) const {
@ -567,12 +438,12 @@ namespace Impl {
 template <class FunctorType, class... Traits>
 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 public:
-  typedef Kokkos::RangePolicy<Traits...> Policy;
+  using Policy = Kokkos::RangePolicy<Traits...>;

 private:
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;

  const FunctorType m_functor;
  const Policy m_policy;
@ -595,7 +466,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
  }

 public:
-  typedef FunctorType functor_type;
+  using functor_type = FunctorType;

  inline __device__ void operator()(void) const {
    const Member work_stride = blockDim.y * gridDim.x;
@ -620,6 +491,7 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
        Kokkos::Impl::cuda_get_opt_block_size<FunctorType, LaunchBounds>(
            m_policy.space().impl_internal_space_instance(), attr, m_functor, 1,
            0, 0);
+    KOKKOS_ASSERT(block_size > 0);
    dim3 block(1, block_size, 1);
    dim3 grid(
        std::min(
@ -646,13 +518,13 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 template <class FunctorType, class... Traits>
 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
 public:
-  typedef Kokkos::MDRangePolicy<Traits...> Policy;
+  using Policy = Kokkos::MDRangePolicy<Traits...>;

 private:
-  using RP = Policy;
-  typedef typename Policy::array_index_type array_index_type;
-  typedef typename Policy::index_type index_type;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using RP               = Policy;
+  using array_index_type = typename Policy::array_index_type;
+  using index_type       = typename Policy::index_type;
+  using LaunchBounds     = typename Policy::launch_bounds;

  const FunctorType m_functor;
  const Policy m_rp;
@ -666,29 +538,36 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
  }

  inline void execute() const {
+    using namespace std;
+
    if (m_rp.m_num_tiles == 0) return;
    const array_index_type maxblocks = static_cast<array_index_type>(
        m_rp.space().impl_internal_space_instance()->m_maxBlock);
    if (RP::rank == 2) {
      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
      const dim3 grid(
-          std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
-                   maxblocks),
-          std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
-                   maxblocks),
+          min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
+              maxblocks),
+          min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
+              maxblocks),
          1);
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
    } else if (RP::rank == 3) {
      const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]);
+      KOKKOS_ASSERT(block.x > 0);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
-          std::min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
-                   maxblocks),
-          std::min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
-                   maxblocks),
-          std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
-                   maxblocks));
+          min((m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x,
+              maxblocks),
+          min((m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y,
+              maxblocks),
+          min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z,
+              maxblocks));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
@ -697,14 +576,15 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2],
                       m_rp.m_tile[3]);
+      KOKKOS_ASSERT(block.y > 0);
+      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
-          std::min(
-              static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
+          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
              static_cast<index_type>(maxblocks)),
-          std::min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
-                   maxblocks),
-          std::min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
-                   maxblocks));
+          min((m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y,
+              maxblocks),
+          min((m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z,
+              maxblocks));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
@ -713,15 +593,14 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
      // threadIdx.z
      const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1],
                       m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]);
+      KOKKOS_ASSERT(block.z > 0);
      const dim3 grid(
-          std::min(
-              static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
+          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
              static_cast<index_type>(maxblocks)),
-          std::min(
-              static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
+          min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
              static_cast<index_type>(maxblocks)),
-          std::min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
-                   maxblocks));
+          min((m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z,
+              maxblocks));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
          false);
@ -732,14 +611,11 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>, Kokkos::Cuda> {
                       m_rp.m_tile[2] * m_rp.m_tile[3],
                       m_rp.m_tile[4] * m_rp.m_tile[5]);
      const dim3 grid(
-          std::min(
-              static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
+          min(static_cast<index_type>(m_rp.m_tile_end[0] * m_rp.m_tile_end[1]),
              static_cast<index_type>(maxblocks)),
-          std::min(
-              static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
+          min(static_cast<index_type>(m_rp.m_tile_end[2] * m_rp.m_tile_end[3]),
              static_cast<index_type>(maxblocks)),
-          std::min(
-              static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]),
+          min(static_cast<index_type>(m_rp.m_tile_end[4] * m_rp.m_tile_end[5]),
              static_cast<index_type>(maxblocks)));
      CudaParallelLaunch<ParallelFor, LaunchBounds>(
          *this, grid, block, 0, m_rp.space().impl_internal_space_instance(),
@ -760,16 +636,16 @@ template <class FunctorType, class... Properties>
 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                  Kokkos::Cuda> {
 public:
-  typedef TeamPolicyInternal<Kokkos::Cuda, Properties...> Policy;
+  using Policy = TeamPolicyInternal<Kokkos::Cuda, Properties...>;

 private:
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;

 public:
-  typedef FunctorType functor_type;
-  typedef Cuda::size_type size_type;
+  using functor_type = FunctorType;
+  using size_type    = Cuda::size_type;

 private:
  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
@ -941,34 +817,34 @@ template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                     Kokkos::Cuda> {
 public:
-  typedef Kokkos::RangePolicy<Traits...> Policy;
+  using Policy = Kokkos::RangePolicy<Traits...>;

 private:
-  typedef typename Policy::WorkRange WorkRange;
-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using WorkRange    = typename Policy::WorkRange;
+  using WorkTag      = typename Policy::work_tag;
+  using Member       = typename Policy::member_type;
+  using LaunchBounds = typename Policy::launch_bounds;

-  typedef Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                             FunctorType, ReducerType>
-      ReducerConditional;
-  typedef typename ReducerConditional::type ReducerTypeFwd;
-  typedef
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type WorkTagFwd;
+                                  WorkTag, void>::type;

-  typedef Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>
-      ValueTraits;
-  typedef Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd> ValueInit;
-  typedef Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd> ValueJoin;
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

 public:
-  typedef typename ValueTraits::pointer_type pointer_type;
-  typedef typename ValueTraits::value_type value_type;
-  typedef typename ValueTraits::reference_type reference_type;
-  typedef FunctorType functor_type;
-  typedef Kokkos::Cuda::size_type size_type;
-  typedef typename Policy::index_type index_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
+  using value_type     = typename ValueTraits::value_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Kokkos::Cuda::size_type;
+  using index_type     = typename Policy::index_type;

  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
  // blockDim.z == 1
@ -990,8 +866,8 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
      //};
      // Some crutch to do function overloading
 private:
-  typedef double DummyShflReductionType;
-  typedef int DummySHMEMReductionType;
+  using DummyShflReductionType  = double;
+  using DummySHMEMReductionType = int;

 public:
  // Make the exec_range calls call to Reduce::DeviceIterateTile
@ -1124,13 +1000,19 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
    int shmem_size =
        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
            f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type,
+                           LaunchBounds>::get_cuda_func_attributes();
    while (
        (n &&
         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
          shmem_size)) ||
-        (n > static_cast<unsigned>(
-                 Kokkos::Impl::cuda_get_max_block_size<
-                     ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) {
+        (n >
+         static_cast<unsigned>(
+             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
      n >>= 1;
      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                             WorkTag>(f, n);
@ -1142,6 +1024,7 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
    const index_type nwork = m_policy.end() - m_policy.begin();
    if (nwork) {
      const int block_size = local_block_size(m_functor);
+      KOKKOS_ASSERT(block_size > 0);

      m_scratch_space = cuda_internal_scratch_space(
          m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
@ -1215,9 +1098,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ViewType::memory_space>::accessible),
-        m_scratch_space(0),
-        m_scratch_flags(0),
-        m_unified_space(0) {}
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}

  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                 const ReducerType& reducer)
@ -1229,9 +1112,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
-        m_scratch_space(0),
-        m_scratch_flags(0),
-        m_unified_space(0) {}
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}
 };

 // MDRangePolicy impl
@ -1239,35 +1122,35 @@ template <class FunctorType, class ReducerType, class... Traits>
 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                     Kokkos::Cuda> {
 public:
-  typedef Kokkos::MDRangePolicy<Traits...> Policy;
+  using Policy = Kokkos::MDRangePolicy<Traits...>;

 private:
-  typedef typename Policy::array_index_type array_index_type;
-  typedef typename Policy::index_type index_type;
+  using array_index_type = typename Policy::array_index_type;
+  using index_type       = typename Policy::index_type;

-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using WorkTag      = typename Policy::work_tag;
+  using Member       = typename Policy::member_type;
+  using LaunchBounds = typename Policy::launch_bounds;

-  typedef Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                             FunctorType, ReducerType>
-      ReducerConditional;
-  typedef typename ReducerConditional::type ReducerTypeFwd;
-  typedef
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type WorkTagFwd;
+                                  WorkTag, void>::type;

-  typedef Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>
-      ValueTraits;
-  typedef Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd> ValueInit;
-  typedef Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd> ValueJoin;
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

 public:
-  typedef typename ValueTraits::pointer_type pointer_type;
-  typedef typename ValueTraits::value_type value_type;
-  typedef typename ValueTraits::reference_type reference_type;
-  typedef FunctorType functor_type;
-  typedef Cuda::size_type size_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
+  using value_type     = typename ValueTraits::value_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Cuda::size_type;

  // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
  // blockDim.z == 1
@ -1281,10 +1164,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  size_type* m_scratch_flags;
  size_type* m_unified_space;

-  typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<
+  using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
      Policy::rank, Policy, FunctorType, typename Policy::work_tag,
-      reference_type>
-      DeviceIteratePattern;
+      reference_type>;

  // Shall we use the shfl based reduction or not (only use it for static sized
  // types of more than 128bit
@ -1294,8 +1176,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  };
  // Some crutch to do function overloading
 private:
-  typedef double DummyShflReductionType;
-  typedef int DummySHMEMReductionType;
+  using DummyShflReductionType  = double;
+  using DummySHMEMReductionType = int;

 public:
  inline __device__ void exec_range(reference_type update) const {
@ -1414,13 +1296,19 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
    int shmem_size =
        cuda_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
            f, n);
+    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
+    cudaFuncAttributes attr =
+        CudaParallelLaunch<closure_type,
+                           LaunchBounds>::get_cuda_func_attributes();
    while (
        (n &&
         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
          shmem_size)) ||
-        (n > static_cast<unsigned>(
-                 Kokkos::Impl::cuda_get_max_block_size<
-                     ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) {
+        (n >
+         static_cast<unsigned>(
+             Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
+                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
+                 shmem_size, 0)))) {
      n >>= 1;
      shmem_size = cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                             WorkTag>(f, n);
@ -1507,9 +1395,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ViewType::memory_space>::accessible),
-        m_scratch_space(0),
-        m_scratch_flags(0),
-        m_unified_space(0) {}
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}

  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                 const ReducerType& reducer)
@ -1521,9 +1409,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
-        m_scratch_space(0),
-        m_scratch_flags(0),
-        m_unified_space(0) {}
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr) {}
 };

 //----------------------------------------------------------------------------
@ -1532,39 +1420,39 @@ template <class FunctorType, class ReducerType, class... Properties>
 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                     ReducerType, Kokkos::Cuda> {
 public:
-  typedef TeamPolicyInternal<Kokkos::Cuda, Properties...> Policy;
+  using Policy = TeamPolicyInternal<Kokkos::Cuda, Properties...>;

 private:
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using LaunchBounds = typename Policy::launch_bounds;

-  typedef Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                             FunctorType, ReducerType>
-      ReducerConditional;
-  typedef typename ReducerConditional::type ReducerTypeFwd;
-  typedef
+  using ReducerConditional =
+      Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
+                         FunctorType, ReducerType>;
+  using ReducerTypeFwd = typename ReducerConditional::type;
+  using WorkTagFwd =
      typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
-                                  WorkTag, void>::type WorkTagFwd;
+                                  WorkTag, void>::type;

-  typedef Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>
-      ValueTraits;
-  typedef Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd> ValueInit;
-  typedef Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd> ValueJoin;
+  using ValueTraits =
+      Kokkos::Impl::FunctorValueTraits<ReducerTypeFwd, WorkTagFwd>;
+  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
+  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;

-  typedef typename ValueTraits::pointer_type pointer_type;
-  typedef typename ValueTraits::reference_type reference_type;
-  typedef typename ValueTraits::value_type value_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using value_type     = typename ValueTraits::value_type;

 public:
-  typedef FunctorType functor_type;
-  typedef Cuda::size_type size_type;
+  using functor_type = FunctorType;
+  using size_type    = Cuda::size_type;

  enum { UseShflReduction = (true && (ValueTraits::StaticValueSize != 0)) };

 private:
-  typedef double DummyShflReductionType;
-  typedef int DummySHMEMReductionType;
+  using DummyShflReductionType  = double;
+  using DummySHMEMReductionType = int;

  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y ==
  // blockDim.z == 1 shared memory utilization:
@ -1818,9 +1706,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ViewType::memory_space>::accessible),
-        m_scratch_space(0),
-        m_scratch_flags(0),
-        m_unified_space(0),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr),
        m_team_begin(0),
        m_shmem_begin(0),
        m_shmem_size(0),
@ -1917,9 +1805,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
            MemorySpaceAccess<Kokkos::CudaSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible),
-        m_scratch_space(0),
-        m_scratch_flags(0),
-        m_unified_space(0),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
+        m_unified_space(nullptr),
        m_team_begin(0),
        m_shmem_begin(0),
        m_shmem_size(0),
@ -2013,23 +1901,23 @@ namespace Impl {
 template <class FunctorType, class... Traits>
 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
 public:
-  typedef Kokkos::RangePolicy<Traits...> Policy;
+  using Policy = Kokkos::RangePolicy<Traits...>;

 private:
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::WorkRange WorkRange;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using WorkRange    = typename Policy::WorkRange;
+  using LaunchBounds = typename Policy::launch_bounds;

-  typedef Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag> ValueTraits;
-  typedef Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag> ValueInit;
-  typedef Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag> ValueOps;
+  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
+  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
+  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;

 public:
-  typedef typename ValueTraits::pointer_type pointer_type;
-  typedef typename ValueTraits::reference_type reference_type;
-  typedef FunctorType functor_type;
-  typedef Cuda::size_type size_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Cuda::size_type;

 private:
  // Algorithmic constraints:
@ -2233,6 +2121,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
      enum { GridMaxComputeCapability_2x = 0x0ffff };

      const int block_size = local_block_size(m_functor);
+      KOKKOS_ASSERT(block_size > 0);

      const int grid_max =
          (block_size * block_size) < GridMaxComputeCapability_2x
@ -2283,8 +2172,8 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
  ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor),
        m_policy(arg_policy),
-        m_scratch_space(0),
-        m_scratch_flags(0),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
        m_final(false)
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
        ,
@ -2299,23 +2188,23 @@ template <class FunctorType, class ReturnType, class... Traits>
 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                            ReturnType, Kokkos::Cuda> {
 public:
-  typedef Kokkos::RangePolicy<Traits...> Policy;
+  using Policy = Kokkos::RangePolicy<Traits...>;

 private:
-  typedef typename Policy::member_type Member;
-  typedef typename Policy::work_tag WorkTag;
-  typedef typename Policy::WorkRange WorkRange;
-  typedef typename Policy::launch_bounds LaunchBounds;
+  using Member       = typename Policy::member_type;
+  using WorkTag      = typename Policy::work_tag;
+  using WorkRange    = typename Policy::WorkRange;
+  using LaunchBounds = typename Policy::launch_bounds;

-  typedef Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag> ValueTraits;
-  typedef Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag> ValueInit;
-  typedef Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag> ValueOps;
+  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, WorkTag>;
+  using ValueInit   = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
+  using ValueOps    = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;

 public:
-  typedef typename ValueTraits::pointer_type pointer_type;
-  typedef typename ValueTraits::reference_type reference_type;
-  typedef FunctorType functor_type;
-  typedef Cuda::size_type size_type;
+  using pointer_type   = typename ValueTraits::pointer_type;
+  using reference_type = typename ValueTraits::reference_type;
+  using functor_type   = FunctorType;
+  using size_type      = Cuda::size_type;

 private:
  // Algorithmic constraints:
@ -2523,6 +2412,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
      enum { GridMaxComputeCapability_2x = 0x0ffff };

      const int block_size = local_block_size(m_functor);
+      KOKKOS_ASSERT(block_size > 0);

      const int grid_max =
          (block_size * block_size) < GridMaxComputeCapability_2x
@ -2585,8 +2475,8 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                        const Policy& arg_policy, ReturnType& arg_returnvalue)
      : m_functor(arg_functor),
        m_policy(arg_policy),
-        m_scratch_space(0),
-        m_scratch_flags(0),
+        m_scratch_space(nullptr),
+        m_scratch_flags(nullptr),
        m_final(false),
        m_returnvalue(arg_returnvalue)
 #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION
@ -2610,7 +2500,7 @@ template <class FunctorType, class ExecPolicy, class ValueType,
          class Tag = typename ExecPolicy::work_tag>
 struct CudaFunctorAdapter {
  const FunctorType f;
-  typedef ValueType value_type;
+  using value_type = ValueType;
  CudaFunctorAdapter(const FunctorType& f_) : f(f_) {}

  __device__ inline void operator()(typename ExecPolicy::work_tag,
@ -2680,7 +2570,7 @@ struct CudaFunctorAdapter {
 template <class FunctorType, class ExecPolicy, class ValueType>
 struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> {
  const FunctorType f;
-  typedef ValueType value_type;
+  using value_type = ValueType;
  CudaFunctorAdapter(const FunctorType& f_) : f(f_) {}

  __device__ inline void operator()(const typename ExecPolicy::member_type& i,
@ -2801,13 +2691,14 @@ struct CudaFunctorAdapter<FunctorType, ExecPolicy, ValueType, void> {
 template <class FunctorType, class ResultType, class Tag,
          bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value>
 struct FunctorReferenceType {
-  typedef ResultType& reference_type;
+  using reference_type = ResultType&;
 };

 template <class FunctorType, class ResultType, class Tag>
 struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
-  typedef typename Kokkos::Impl::FunctorValueTraits<
-      FunctorType, Tag>::reference_type reference_type;
+  using reference_type =
+      typename Kokkos::Impl::FunctorValueTraits<FunctorType,
+                                                Tag>::reference_type;
 };

 template <class FunctorTypeIn, class ExecPolicy, class ValueType>
@ -2815,10 +2706,9 @@ struct ParallelReduceFunctorType<FunctorTypeIn, ExecPolicy, ValueType, Cuda> {
  enum {
    FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value
  };
-  typedef typename Kokkos::Impl::if_c<
+  using functor_type = typename Kokkos::Impl::if_c<
      FunctorHasValueType, FunctorTypeIn,
-      Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type
-      functor_type;
+      Impl::CudaFunctorAdapter<FunctorTypeIn, ExecPolicy, ValueType>>::type;
  static functor_type functor(const FunctorTypeIn& functor_in) {
    return Impl::if_c<FunctorHasValueType, FunctorTypeIn, functor_type>::select(
        functor_in, functor_type(functor_in));
--- a/Show More
+++ b/Show More