Kokkos lib update

This commit is contained in:
Steve Plimpton 2016-09-08 13:56:18 -06:00
parent 0252347d43
commit 236ebf7fab
212 changed files with 18902 additions and 13466 deletions

View File

@ -1,4 +1,15 @@
IF(COMMAND TRIBITS_PACKAGE_DECL)
SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
ELSE()
SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
ENDIF()
IF(NOT KOKKOS_HAS_TRILINOS)
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
INCLUDE(cmake/tribits.cmake)
ENDIF()
#
# A) Forward declare the package so that certain options are also defined for
# subpackages
@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
# subpackages as well.
#
TRIBITS_ADD_DEBUG_OPTION()
# mfh 01 Aug 2016: See Issue #61:
#
# https://github.com/kokkos/kokkos/issues/61
#
# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
# for compatibility with Kokkos' Makefile build system.
TRIBITS_ADD_OPTION_AND_DEFINE(
${PACKAGE_NAME}_ENABLE_DEBUG
${PACKAGE_NAME_UC}_HAVE_DEBUG
"Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
${${PROJECT_NAME}_ENABLE_DEBUG}
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_SIERRA_BUILD
@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
"${TPL_ENABLE_MPI}"
)
# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option
#
# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check
# option (defined below) is annoyingly not all caps, but we need to
# keep it that way for backwards compatibility. If users forget and
# try using an all-caps variable, then make it count by using the
# all-caps version as the default value of the original, not-all-caps
# option. Otherwise, the default value of this option comes from
# Kokkos_ENABLE_DEBUG (see Issue #367).
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
ELSE()
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
ENDIF()
ELSE()
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
ENDIF()
ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Debug_Bounds_Check
KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
"Enable bounds checking support in Kokkos."
OFF
"Enable Kokkos::View run-time bounds checking."
"${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
)
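As a hedged illustration (not part of this diff): when the resulting KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK define is active, Kokkos::View accesses are range-checked at run time, so an out-of-range index aborts instead of silently corrupting memory. A minimal C++ sketch:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 10);
    // v(12) = 1.0;  // with bounds checking enabled, this aborts with an error
    v(9) = 1.0;      // in-range access is always fine
  }
  Kokkos::finalize();
  return 0;
}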
TRIBITS_ADD_OPTION_AND_DEFINE(

View File

@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS)
#Options: OpenMP,Serial,Pthreads,Cuda
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
KOKKOS_ARCH ?= ""
#Options: yes,no
KOKKOS_DEBUG ?= "no"
@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
#NVIDIA based
@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM
#Any AVX?
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
# Decide what ISA level we are able to support
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
@ -304,8 +308,8 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -mcpu=power8
KOKKOS_LDFLAGS += -mcpu=power8
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
else
# Assume that this is really a GNU compiler
KOKKOS_CXXFLAGS += -march=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
endif
endif
endif
@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_53
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_61
endif
endif
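For context, a hedged sketch of how the generated define can be consumed: Makefile.kokkos appends "#define KOKKOS_ARCH_PASCAL61 1" (plus the generic KOKKOS_ARCH_PASCAL) to KokkosCore_config.h, so downstream C++ code can branch on the target GPU at compile time. The branch body here is hypothetical:

#include <cstdio>

int main() {
#if defined(KOKKOS_ARCH_PASCAL61)
  std::printf("built for Pascal sm_61\n");   // hypothetical arch-specific path
#else
  std::printf("not a Pascal 6.1 build\n");
#endif
  return 0;
}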
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)

View File

@ -1,9 +1,5 @@
Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
endif
@ -61,6 +61,8 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
endif
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp

View File

@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov
====Requirements============================================================
============================================================================
Primary tested compilers are:
Primary tested compilers on X86 are:
GCC 4.7.2
GCC 4.8.4
GCC 4.9.2
@ -48,26 +48,43 @@ Primary tested compilers are:
Clang 3.5.2
Clang 3.6.1
Primary tested compilers on Power 8 are:
IBM XL 13.1.3 (OpenMP,Serial)
GCC 4.9.2 (OpenMP,Serial)
GCC 5.3.0 (OpenMP,Serial)
Secondary tested compilers are:
CUDA 6.5 (with gcc 4.7.2)
CUDA 7.0 (with gcc 4.7.2)
CUDA 7.5 (with gcc 4.8.4)
Other compilers working:
PGI 15.4
IBM XL 13.1.2
Cygwin 2.1.0 64bit with gcc 4.9.3
X86:
Intel 17.0.042 (the FENL example causes an internal compiler error)
PGI 15.4
Cygwin 2.1.0 64bit with gcc 4.9.3
KNL:
Intel 16.2.181 (the FENL example causes an internal compiler error)
Intel 17.0.042 (the FENL example causes an internal compiler error)
Known non-working combinations:
Power8:
GCC 6.1.0
Pthreads backend
Primary tested compilers are passing in release mode
with warnings as errors. We are using the following set
of flags:
with warnings as errors. They are also tested with a comprehensive set of
backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
We are using the following set of flags:
GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
-Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
Secondary compilers are passing without -Werror.
Other compilers are tested occasionally.
Other compilers are tested occasionally, in particular when pushing from develop to
master branch, without -Werror and only for a select set of backends.
============================================================================
====Getting started=========================================================

View File

@ -771,6 +771,7 @@ namespace Kokkos {
friend class Random_XorShift1024_Pool<DeviceType>;
public:
typedef Random_XorShift1024_Pool<DeviceType> pool_type;
typedef DeviceType device_type;
enum {MAX_URAND = 0xffffffffU};
@ -779,10 +780,10 @@ namespace Kokkos {
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx){
for(int i=0 ; i<16; i++)
state_[i] = state[i];
state_[i] = state(state_idx,i);
}
KOKKOS_INLINE_FUNCTION
@ -933,6 +934,7 @@ namespace Kokkos {
state_data_type state_;
int_view_type p_;
int num_states_;
friend class Random_XorShift1024<DeviceType>;
public:
typedef Random_XorShift1024<DeviceType> generator_type;
@ -1001,7 +1003,7 @@ namespace Kokkos {
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<DeviceType> get_state() const {
const int i = DeviceType::hardware_thread_id();
return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
return Random_XorShift1024<DeviceType>(state_,p_(i),i);
};
KOKKOS_INLINE_FUNCTION
@ -1020,10 +1022,12 @@ namespace Kokkos {
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
public:
typedef Kokkos::Cuda device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
@ -1031,30 +1035,30 @@ namespace Kokkos {
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(state){
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[ p_ ];
uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
tmp = tmp>>16;
return static_cast<uint32_t>(tmp&MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[ p_ ];
uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
#endif
}
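A minimal usage sketch of the pool/generator pattern these changes preserve: get_state(), urand(), and free_state() are taken from the source above, while the functor name, view, and seed are illustrative.

#include <cstdint>
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

struct Draw {
  Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool;
  Kokkos::View<uint32_t*> out;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i) const {
    auto gen = pool.get_state();  // acquire a per-thread generator
    out(i) = gen.urand();         // generator state is now read with the pool view's stride
    pool.free_state(gen);         // return the generator to the pool
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool(31891);
    Kokkos::View<uint32_t*> out("out", 1000);
    Kokkos::parallel_for(1000, Draw{pool, out});
  }
  Kokkos::finalize();
  return 0;
}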
@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
#endif
namespace Impl {
template<class ViewType, class RandomPool, int loops, int rank>
template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
struct fill_random_functor_range;
template<class ViewType, class RandomPool, int loops, int rank>
template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
struct fill_random_functor_begin_end;
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (const IndexType& i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0())
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0()))
a(idx) = Rand::draw(gen,range);
}
rand_pool.free_state(gen);
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
a(idx,k) = Rand::draw(gen,range);
}
}
@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
a(idx,k,l) = Rand::draw(gen,range);
}
}
@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
a(idx,k,l,m) = Rand::draw(gen,range);
}
}
@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
a(idx,k,l,m,n) = Rand::draw(gen,range);
}
}
@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
a(idx,k,l,m,n,o) = Rand::draw(gen,range);
}
}
@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
}
}
@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(unsigned int q=0;q<a.dimension_7();q++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
}
}
rand_pool.free_state(gen);
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0())
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0()))
a(idx) = Rand::draw(gen,begin,end);
}
rand_pool.free_state(gen);
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
a(idx,k) = Rand::draw(gen,begin,end);
}
}
@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
a(idx,k,l) = Rand::draw(gen,begin,end);
}
}
@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1584,14 +1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
a(idx,k,l,m) = Rand::draw(gen,begin,end);
}
}
@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()){
for(unsigned int l=0;l<a.dimension_1();l++)
for(unsigned int m=0;m<a.dimension_2();m++)
for(unsigned int n=0;n<a.dimension_3();n++)
for(unsigned int o=0;o<a.dimension_4();o++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())){
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++)
a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
}
}
@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
}
}
@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
}
}
@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(unsigned int q=0;q<a.dimension_7();q++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
}
}
@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
}
};
template<class ViewType, class RandomPool>
}
template<class ViewType, class RandomPool, class IndexType = int64_t>
void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
int64_t LDA = a.dimension_0();
if(LDA>0)
parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
}
template<class ViewType, class RandomPool>
template<class ViewType, class RandomPool, class IndexType = int64_t>
void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
int64_t LDA = a.dimension_0();
if(LDA>0)
parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
}
}
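A hedged usage sketch of the new IndexType template parameter on fill_random (defaulting to int64_t per the diff); the view, pool type, and seed are illustrative:

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> a("a", 100000);
    Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool(31891);
    Kokkos::fill_random(a, pool, 1.0);  // IndexType defaults to int64_t
    // Force 32-bit indexing inside the fill functors:
    Kokkos::fill_random<decltype(a), decltype(pool), int>(a, pool, 1.0);
  }
  Kokkos::finalize();
  return 0;
}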

View File

@ -50,6 +50,7 @@
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <cmath>
#include <chrono>
namespace Test {
@ -207,7 +208,6 @@ struct test_histogram1d_functor {
density_1d (d1d),
mean (1.0*num_draws/HIST_DIM1D*3)
{
printf ("Mean: %e\n", mean);
}
KOKKOS_INLINE_FUNCTION void
@ -295,7 +295,7 @@ struct test_random_scalar {
parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
//printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
double tolerance = 2.0*sqrt(1.0/num_draws);
double tolerance = 1.6*sqrt(1.0/num_draws);
double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
double variance_expect = 1.0/3.0*mean_expect*mean_expect;
double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
@ -303,10 +303,10 @@ struct test_random_scalar {
double covariance_eps = result.covariance/num_draws/2/variance_expect;
pass_mean = ((-tolerance < mean_eps) &&
( tolerance > mean_eps)) ? 1:0;
pass_var = ((-tolerance < variance_eps) &&
( tolerance > variance_eps)) ? 1:0;
pass_covar = ((-1.4*tolerance < covariance_eps) &&
( 1.4*tolerance > covariance_eps)) ? 1:0;
pass_var = ((-1.5*tolerance < variance_eps) &&
( 1.5*tolerance > variance_eps)) ? 1:0;
pass_covar = ((-2.0*tolerance < covariance_eps) &&
( 2.0*tolerance > covariance_eps)) ? 1:0;
cerr << "Pass: " << pass_mean
<< " " << pass_var
<< " " << mean_eps
@ -328,12 +328,12 @@ struct test_random_scalar {
double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
pass_hist1d_mean = ((-tolerance < mean_eps) &&
( tolerance > mean_eps)) ? 1:0;
pass_hist1d_var = ((-tolerance < variance_eps) &&
( tolerance > variance_eps)) ? 1:0;
pass_hist1d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
pass_hist1d_mean = ((-0.0001 < mean_eps) &&
( 0.0001 > mean_eps)) ? 1:0;
pass_hist1d_var = ((-0.07 < variance_eps) &&
( 0.07 > variance_eps)) ? 1:0;
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
( 0.06 > covariance_eps)) ? 1:0;
cerr << "Density 1D: " << mean_eps
<< " " << variance_eps
@ -363,8 +363,8 @@ struct test_random_scalar {
double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
pass_hist3d_mean = ((-tolerance < mean_eps) &&
( tolerance > mean_eps)) ? 1:0;
pass_hist3d_var = ((-tolerance < variance_eps) &&
( tolerance > variance_eps)) ? 1:0;
pass_hist3d_var = ((-1.2*tolerance < variance_eps) &&
( 1.2*tolerance > variance_eps)) ? 1:0;
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
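The tolerances above scale with the standard error of the estimators, which shrinks as 1/sqrt(num_draws); a small illustrative calculation (the draw count is hypothetical):

#include <cmath>
#include <cstdio>

int main() {
  const double num_draws = 1024.0 * 1024.0;               // hypothetical draw count
  const double tolerance = 1.6 * std::sqrt(1.0 / num_draws);
  std::printf("tolerance = %e\n", tolerance);             // ~1.5625e-03
  return 0;
}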
@ -386,8 +386,13 @@ void test_random(unsigned int num_draws)
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
cerr << "Test Seed:" << ticks << endl;
RandomGenerator pool(ticks);
cerr << "Test Scalar=int" << endl;
RandomGenerator pool(31891);
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int.pass_mean,1);
ASSERT_EQ( test_int.pass_var,1);

View File

@ -0,0 +1,79 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
# Check for CUDA support
SET(_CUDA_FAILURE OFF)
# Have CMake find CUDA
IF(NOT _CUDA_FAILURE)
FIND_PACKAGE(CUDA 3.2)
IF (NOT CUDA_FOUND)
SET(_CUDA_FAILURE ON)
ENDIF()
ENDIF()
IF(NOT _CUDA_FAILURE)
# if we have not encountered a failure
macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
endmacro()
GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ELSE()
SET(TPL_ENABLE_CUDA OFF)
ENDIF()

View File

@ -0,0 +1,64 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
IF (TPL_ENABLE_CUDA)
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Hardware locality detection and control library.
#
# Acquisition information:
# Date checked: November 2011
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: http://www.open-mpi.org/projects/hwloc/
# Version: 1.3
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
REQUIRED_HEADERS hwloc.h
REQUIRED_LIBS_NAMES "hwloc"
)

View File

@ -0,0 +1,83 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
SET(USE_THREADS FALSE)
IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
# Use CMake's Thread finder since it is a bit smarter in determining
# whether pthreads is already built into the compiler and doesn't need
# a library to link.
FIND_PACKAGE(Threads)
#If Threads found a copy of pthreads make sure it is one of the cases the tribits
#tpl system cannot handle.
IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
SET(USE_THREADS TRUE)
ENDIF()
ENDIF()
ENDIF()
IF(USE_THREADS)
SET(TPL_Pthread_INCLUDE_DIRS "")
SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
SET(TPL_Pthread_LIBRARY_DIRS "")
TRIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
ELSE()
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
REQUIRED_HEADERS pthread.h
REQUIRED_LIBS_NAMES pthread
)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Qthreads lightweight threading/tasking library.
#
# Acquisition information:
# Date checked: July 2014
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -0,0 +1,485 @@
INCLUDE(CMakeParseArguments)
INCLUDE(CTest)
FUNCTION(ASSERT_DEFINED VARS)
FOREACH(VAR ${VARS})
IF(NOT DEFINED ${VAR})
MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!")
ENDIF()
ENDFOREACH()
ENDFUNCTION()
MACRO(GLOBAL_SET VARNAME)
SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
ENDMACRO()
MACRO(PREPEND_GLOBAL_SET VARNAME)
ASSERT_DEFINED(${VARNAME})
GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
ENDMACRO()
FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
ASSERT_DEFINED(${VARNAME})
IF (${VARNAME})
SET(TMP ${${VARNAME}})
LIST(REMOVE_DUPLICATES TMP)
GLOBAL_SET(${VARNAME} ${TMP})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE)
MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
IF(${USER_OPTION_NAME})
GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
ELSE()
GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
ENDIF()
ENDIF()
ENDMACRO()
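# Illustrative use of the macro above (the option and macro names here are
# hypothetical, not from this commit):
#   TRIBITS_ADD_OPTION_AND_DEFINE(
#     Kokkos_ENABLE_Foo      # user-facing CACHE BOOL option
#     KOKKOS_HAVE_FOO        # variable globally SET to ON/OFF to match
#     "Enable the hypothetical Foo feature."
#     OFF
#   )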
FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE)
# Configure the file
CONFIGURE_FILE(
${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
)
ENDFUNCTION()
MACRO(TRIBITS_ADD_DEBUG_OPTION)
TRIBITS_ADD_OPTION_AND_DEFINE(
${PROJECT_NAME}_ENABLE_DEBUG
HAVE_${PROJECT_NAME_UC}_DEBUG
"Enable a host of runtime debug checking."
OFF
)
ENDMACRO()
MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
FOREACH(TEST_DIR ${ARGN})
ADD_SUBDIRECTORY(${TEST_DIR})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
FOREACH(EXAMPLE_DIR ${ARGN})
ADD_SUBDIRECTORY(${EXAMPLE_DIR})
ENDFOREACH()
ENDIF()
ENDMACRO()
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
SET(PROP_VALUES)
FOREACH(TARGET_X ${ARGN})
LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
ENDFOREACH()
SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
ENDMACRO()
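# ADD_INTERFACE_LIBRARY below emulates an interface library on older CMake:
# it builds an empty static library from a generated dummy.cpp whose only
# purpose is to carry transitive link and include requirements.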
MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
ENDMACRO()
# Older versions of CMake do not make include directories transitive
MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
FOREACH(DEP_LIB ${ARGN})
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
ENDFOREACH()
ENDMACRO()
FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
SET(oneValueArgs)
SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
IF(PARSE_HEADERS)
LIST(REMOVE_DUPLICATES PARSE_HEADERS)
ENDIF()
IF(PARSE_SOURCES)
LIST(REMOVE_DUPLICATES PARSE_SOURCES)
ENDIF()
# Local variable to hold all of the libraries that will be directly linked
# to this library.
SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
# Add dependent libraries passed directly in
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
IF (PARSE_DEPLIBS)
LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
ENDIF()
# Add the library and all the dependencies
IF (PARSE_DEFINES)
ADD_DEFINITIONS(${PARSE_DEFINES})
ENDIF()
IF (PARSE_STATIC)
SET(STATIC_KEYWORD "STATIC")
ELSE()
SET(STATIC_KEYWORD)
ENDIF()
IF (PARSE_SHARED)
SET(SHARED_KEYWORD "SHARED")
ELSE()
SET(SHARED_KEYWORD)
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
IF (NOT PARSE_CUDALIBRARY)
ADD_LIBRARY(
${LIBRARY_NAME}
${STATIC_KEYWORD}
${SHARED_KEYWORD}
${EXCLUDE_FROM_ALL_KEYWORD}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ELSE()
CUDA_ADD_LIBRARY(
${LIBRARY_NAME}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ENDIF()
TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
INSTALL(
TARGETS ${LIBRARY_NAME}
EXPORT ${PROJECT_NAME}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
FILES ${PARSE_HEADERS}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
DIRECTORY ${PARSE_HEADERS_INSTALL_SUBDIR}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
ENDIF()
IF (NOT PARSE_TESTONLY)
PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
ENDIF()
ENDFUNCTION()
FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
# NOTE: PARSE_TARGET_DEFINES is applied after ADD_EXECUTABLE below;
# TARGET_COMPILE_DEFINITIONS requires the target to exist first.
SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
IF (PARSE_TESTONLYLIBS)
LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
ENDIF()
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
SET (EXE_SOURCES)
IF(PARSE_DIRECTORY)
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
IF(IS_ABSOLUTE ${SOURCE_FILE})
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ELSE()
SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
ENDIF()
ENDFOREACH( )
ELSE()
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ENDFOREACH( )
ENDIF()
SET(EXE_BINARY_NAME ${EXE_NAME})
IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
IF (PARSE_TARGET_DEFINES)
TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
ENDIF()
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_INSTALLABLE)
INSTALL(
TARGETS ${EXE_BINARY_NAME}
EXPORT ${PROJECT_NAME}
DESTINATION bin
)
ENDIF()
ENDFUNCTION()
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
IF(WIN32)
ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
ELSE()
ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
ENDIF()
ADD_DEPENDENCIES(check ${TEST_NAME})
IF(PARSE_FAIL_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_PASS_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_WILL_FAIL)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
ENDIF()
IF(PARSE_ADDED_TESTS_NAMES_OUT)
SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
ENDMACRO()
FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
SET(oneValueArgs)
SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
IF (PARSE_REQUIRED_LIBS_NAMES)
FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
IF(NOT TPL_${TPL_NAME}_LIBRARIES)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (PARSE_REQUIRED_HEADERS)
FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (_${TPL_NAME}_ENABLE_SUCCESS)
TRIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
INCLUDE("${TPL_FILE}")
IF(TARGET TPL_LIB_${TPL_NAME})
MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} TRUE)
ELSE()
MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} FALSE)
ENDIF()
ENDMACRO()
MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
IF(TYPE STREQUAL "REQUIRED")
SET(REQUIRED TRUE)
ELSE()
SET(REQUIRED FALSE)
ENDIF()
IF(TARGET ${TARGET_NAME})
PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
ELSE()
IF(REQUIRED)
MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
ENDIF()
ENDIF()
ENDMACRO()
MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ENABLE_TPLS)
FOREACH(TPL ${ARGN})
IF(TARGET ${TPL})
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
ELSE()
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
ENDIF()
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
SET(options)
SET(oneValueArgs)
SET(multiValueArgs
LIB_REQUIRED_PACKAGES
LIB_OPTIONAL_PACKAGES
TEST_REQUIRED_PACKAGES
TEST_OPTIONAL_PACKAGES
LIB_REQUIRED_TPLS
LIB_OPTIONAL_TPLS
TEST_REQUIRED_TPLS
TEST_OPTIONAL_TPLS
REGRESSION_EMAIL_LIST
SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
ENDMACRO()
MACRO(TRIBITS_SUBPACKAGE NAME)
SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
ENDMACRO(TRIBITS_SUBPACKAGE)
MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
MACRO(TRIBITS_PACKAGE_DECL NAME)
PROJECT(${NAME})
STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
SET(PACKAGE_NAME ${PROJECT_NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
FOREACH(TPL_FILE ${TPLS_FILES})
TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PROCESS_SUBPACKAGES)
FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
FOREACH(SUBPACKAGE ${SUBPACKAGES})
GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
ENDFOREACH()
ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
MACRO(TRIBITS_PACKAGE_DEF)
ENDMACRO(TRIBITS_PACKAGE_DEF)
MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
MACRO(TRIBITS_EXCLUDE_FILES)
ENDMACRO(TRIBITS_EXCLUDE_FILES)
MACRO(TRIBITS_PACKAGE_POSTPROCESS)
ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)
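# A minimal sketch of how a subpackage's CMakeLists.txt would consume this
# shim (the names Core/kokkoscore are illustrative, not from this commit):
#   TRIBITS_SUBPACKAGE(Core)
#   TRIBITS_ADD_LIBRARY(
#     kokkoscore
#     HEADERS ${HEADERS}
#     SOURCES ${SOURCES}
#   )
#   TRIBITS_SUBPACKAGE_POSTPROCESS()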

View File

@ -0,0 +1,153 @@
// -------------------------------------------------------------------------------- //
The following steps are for workstations/servers with the SEMS environment installed.
// -------------------------------------------------------------------------------- //
Summary:
- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
// -------------------------------------------------------------------------------- //
// -------------------------------------------------------------------------------- //
Step 1:
1.1. Update kokkos develop branch (NOT a fork)
(From kokkos directory):
git fetch --all
git checkout develop
git reset --hard origin/develop
1.2. Create a testing directory - here the directory is created within the kokkos directory
mkdir testing
cd testing
1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
../config/test_all_sandia
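For example (options are illustrative; see the --help text in the script):
../config/test_all_sandia --num=4 --build-list=OpenMP,Serial gcc/4.8.4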
1.4. Clean the repository of untracked files
cd ../
git clean -df
// -------------------------------------------------------------------------------- //
Step 2:
2.1. Update the Trilinos develop branch
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
2.2. Snapshot Kokkos into Trilinos; this requires python/2.7.9 and that both Trilinos and Kokkos be clean (no untracked or modified files)
module load python/2.7.9
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
// -------------------------------------------------------------------------------- //
Step 3:
3.1. Build and test Trilinos in three different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following configurations with the appropriate environment variable(s):
- GCC/4.7.2-OpenMP/Complex
Run tests with the following environment variable:
export OMP_NUM_THREADS=2
- Intel/15.0.2-Serial/NoComplex
- GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
Run tests with the following environment variables:
export CUDA_LAUNCH_BLOCKING=1
export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
mkdir Build
cd Build
cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
** Set the path to Trilinos appropriately within the configure-all script **
source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
source configure-all
make -k (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
ctest
3.2. Compare the failed test output to the test output on the dashboard (testing.sandia.gov/cdash, select Trilinos); investigate and fix problems if new tests fail after the Kokkos snapshot
// -------------------------------------------------------------------------------- //
Step 4:
4.1. Once all Trilinos tests pass, promote the Kokkos develop branch to master on GitHub
- DO NOT fast-forward the merge!!!!
(From kokkos directory):
git checkout master
git fetch --all
# Ensure we are on the current origin/master
git reset --hard origin/master
git merge --no-ff origin/develop
4.2. Update the tag in kokkos/config/master_history.txt
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
Tag format: #.#.##
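For example, the existing tag 2.01.06 in config/master_history.txt denotes major version 2, minor version 01, six weeks after the last minor-version update.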
# Prepend master_history.txt with
# tag: #.#.##
# date: mm/dd/yyyy
# master: sha1
# develop: sha1
# -----------------------
git commit --amend -a
git tag -a #.#.##
tag: #.#.##
date: mm/dd/yyyy
master: sha1
develop: sha1
git push --follow-tags origin master
// -------------------------------------------------------------------------------- //
Step 5:
5.1. Make sure Trilinos is up to date; other changes have likely been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot, the testing procedure may need to be repeated
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
5.2. Snapshot Kokkos master branch into Trilinos
(From kokkos directory):
git fetch --all
git checkout tags/#.#.##
git clean -df
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
5.3. Push the updated develop branch of Trilinos to GitHub - congratulations!!!
(From Trilinos directory):
git push
// -------------------------------------------------------------------------------- //

View File

@ -0,0 +1,3 @@
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a

View File

@ -1,17 +1,12 @@
#!/bin/bash
#
# This shell script (nvcc_wrapper) wraps both the host compiler and
# NVCC, if you are building Trilinos with CUDA enabled. The script
# remedies some differences between the interface of NVCC and that of
# the host compiler, in particular for linking. It also means that
# Trilinos doesn't need separate .cu files; it can just use .cpp
# files.
# NVCC, if you are building legacy C or C++ code with CUDA enabled.
# The script remedies some differences between the interface of NVCC
# and that of the host compiler, in particular for linking.
# It also means that a legacy code doesn't need separate .cu files;
# it can just use .cpp files.
#
# Hopefully, at some point, NVIDIA may fix NVCC so as to make this
# script obsolete. For now, this script exists and if you want to
# build Trilinos with CUDA enabled, you must use this script as your
# compiler.
# Default settings: change these according to your machine. For
# example, you may have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten
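# Example invocation (illustrative; g++ assumed as the default host compiler):
#   nvcc_wrapper -O3 -arch=sm_35 -c foo.cpp -o foo.o
# This forwards -arch to nvcc and places -O3 on the shared argument list
# passed to both nvcc and the host compiler.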
@ -53,6 +48,10 @@ object_files=""
# Link objects for the host linker only
object_files_xlinker=""
# Shared libraries with version numbers are not handled correctly by NVCC
shared_versioned_libraries_host=""
shared_versioned_libraries=""
# Does the User set the architecture
arch_set=0
@ -76,6 +75,9 @@ first_xcompiler_arg=1
temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
@ -97,8 +99,17 @@ do
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1"
;;
# Ensure we only have one optimization flag because NVCC doesn't allow multiple
-O*)
if [ $optimization_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
else
shared_args="$shared_args $1"
optimization_applied=1
fi
;;
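# Example: "nvcc_wrapper -O3 -O2 foo.cpp" warns as above and keeps only -O3,
# since nvcc accepts a single optimization setting per command line.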
#Handle shared args (valid for both nvcc and the host compiler)
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
shared_args="$shared_args $1"
;;
#Handle shared args that have an argument
@ -107,7 +118,7 @@ do
shift
;;
#Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1"
;;
#Handle known nvcc args that have an argument
@ -175,10 +186,15 @@ do
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.so.*|*.dylib)
*.dylib)
object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle shared libraries with *.so.* names which nvcc can't do.
*.so.*)
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
;;
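# Example: a link input such as libmpi.so.12 stays verbatim on the host-only
# link line but is prefixed with -Xlinker when handed to nvcc.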
#All other args are sent to the host compiler
*)
if [ $first_xcompiler_arg -eq 1 ]; then
@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then
fi
#Compose compilation command
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args"
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
if [ $first_xcompiler_arg -eq 0 ]; then
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
fi
#Compose host only command
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args"
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
if [ $replace_pragma_ident -eq 1 ]; then

View File

@ -6,34 +6,36 @@
set -o pipefail
# Determine current machine
MACHINE=""
HOSTNAME=$(hostname)
if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
MACHINE=white
elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
MACHINE=bowman
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
MACHINE=shepard
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
MACHINE=sems
else
echo "Unrecognized machine" >&2
exit 1
fi
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS=""
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
# Default. Machine specific can override
DEBUG=False
ARGS=""
CUSTOM_BUILD_LIST=""
@ -41,6 +43,107 @@ DRYRUN=False
BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
TEST_SCRIPT=False
SKIP_HWLOC=False
ARCH_FLAG=""
#
# Machine specific config
#
if [ "$MACHINE" = "sems" ]; then
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
# Don't do pthread on white
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
)
ARCH_FLAG="--arch=Power8"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "bowman" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=KNL"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "shepard" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=HSW"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
while [[ $# > 0 ]]
do
@ -61,6 +164,9 @@ BUILD_ONLY=True
--test-script*)
TEST_SCRIPT=True
;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel "
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
@ -82,21 +189,16 @@ echo " Valid items:"
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers"
echo " gcc/4.7.2"
echo " gcc/4.8.4"
echo " gcc/4.9.2"
echo " gcc/5.1.0"
echo " intel/14.0.4"
echo " intel/15.0.2"
echo " intel/16.0.1"
echo " clang/3.5.2"
echo " clang/3.6.1"
echo " cuda/6.5.14"
echo " cuda/7.0.28"
echo " cuda/7.5.18"
echo " supported compilers sems"
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
echo " $COMPILER"
done
echo ""
echo "Examples:"
echo " Run all tests"
echo " % test_all_sandia"
@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then
ARGS='?'
fi
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
# Process args to figure out which compilers to test
COMPILERS_TO_TEST=""
for ARG in $ARGS; do
@ -240,18 +327,19 @@ run_cmd() {
fi
}
# report_and_log_test_results <SUCCESS> <DESC> <PHASE>
# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
report_and_log_test_result() {
# Use sane var names
local success=$1; local desc=$2; local phase=$3;
local success=$1; local desc=$2; local comment=$3;
if [ "$success" = "0" ]; then
echo " PASSED $desc"
touch $PASSED_DIR/$desc
echo $comment > $PASSED_DIR/$desc
else
# For failures, comment should be the name of the phase that failed
echo " FAILED $desc" >&2
echo $phase > $FAILED_DIR/$desc
cat ${desc}.${phase}.log
echo $comment > $FAILED_DIR/$desc
cat ${desc}.${comment}.log
fi
}
@ -309,6 +397,8 @@ single_build_and_test() {
echo " Starting job $desc"
local comment="no_comment"
if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand
@ -316,14 +406,19 @@ single_build_and_test() {
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"
if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
local -i run_end_time=$(date +%s)
comment="$comment run_time=$(($run_end_time-$build_end_time))"
fi
fi
report_and_log_test_result 0 $desc
report_and_log_test_result 0 $desc "$comment"
return 0
}
@ -374,7 +469,7 @@ build_and_test_all() {
run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too
if [[ "$compiler" != cuda* ]]; then
if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi
done
@ -401,7 +496,11 @@ wait_summarize_and_exit() {
echo "PASSED TESTS"
echo "#######################################################"
\ls -1 $PASSED_DIR | sort
local passed_test
for passed_test in $(\ls -1 $PASSED_DIR | sort)
do
echo $passed_test $(cat $PASSED_DIR/$passed_test)
done
echo "#######################################################"
echo "FAILED TESTS"
@ -409,7 +508,7 @@ wait_summarize_and_exit() {
local failed_test
local -i rv=0
for failed_test in $(\ls -1 $FAILED_DIR)
for failed_test in $(\ls -1 $FAILED_DIR | sort)
do
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
rv=$rv+1

View File

@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES TestOpenMP.cpp)
ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest
# Per #374, we always want to build this test, but we only want to run
# it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
TRIBITS_ADD_EXECUTABLE(
PerfTestExec
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
TRIBITS_ADD_TEST(
PerformanceTest
NAME PerfTestExec
COMM serial mpi
NUM_MPI_PROCS 1
CATEGORIES PERFORMANCE
FAIL_REGULAR_EXPRESSION " FAILED "
)
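# Note (assumption about standard TriBITS behavior): a PERFORMANCE-category
# test runs only when PERFORMANCE is among the test categories enabled at
# configure time, so PerfTestExec is always built but scheduled selectively.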

View File

@ -54,6 +54,8 @@
#if defined( KOKKOS_HAVE_CUDA )
#include <TestDynRankView.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp>
@ -77,6 +79,13 @@ protected:
}
};
TEST_F( cuda, dynrankview_perf )
{
std::cout << "Cuda" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
}
TEST_F( cuda, global_2_local)
{
std::cout << "Cuda" << std::endl;

View File

@ -0,0 +1,265 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
#define KOKKOS_TEST_DYNRANKVIEW_HPP
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <vector>
#include <impl/Kokkos_Timer.hpp>
// Compare performance of DynRankView to View, with a specific focus on the parenthesis operators
namespace Performance {
//View functor
template <typename DeviceType>
struct InitViewFunctor {
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::View<double*, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
struct InitStrideViewFunctor {
typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
inviewtype _inview;
InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
};
template <typename DeviceType>
struct InitViewRank7Functor {
typedef Kokkos::View<double*******, DeviceType> inviewtype;
inviewtype _inview;
InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
}
}
}
};
//DynRankView functor
template <typename DeviceType>
struct InitDynRankViewFunctor {
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
void test_dynrankview_op_perf( const int par_size )
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
const size_type dim2 = 900;
const size_type dim3 = 300;
double elapsed_time_view = 0;
double elapsed_time_compview = 0;
double elapsed_time_strideview = 0;
double elapsed_time_view_rank7 = 0;
double elapsed_time_drview = 0;
double elapsed_time_compdrview = 0;
Kokkos::Timer timer;
{
Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
typedef InitViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view = timer.seconds();
std::cout << " View time (init only): " << elapsed_time_view << std::endl;
timer.reset();
Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
DeviceType::fence();
elapsed_time_compview = timer.seconds();
std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
timer.reset();
Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
DeviceType::fence();
elapsed_time_strideview = timer.seconds();
std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
}
{
Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
typedef InitViewRank7Functor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view_rank7 = timer.seconds();
std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
}
{
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
typedef InitDynRankViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testdrview) );
DeviceType::fence();
elapsed_time_drview = timer.seconds();
std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
timer.reset();
Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
DeviceType::fence();
elapsed_time_compdrview = timer.seconds();
std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
}
std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
timer.reset();
} //end test_dynrankview
} //end Performance
#endif

View File

@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids)
std::cout << num_ids << ", ";
double elasped_time = 0;
Kokkos::Impl::Timer timer;
Kokkos::Timer timer;
local_id_view local_2_global("local_ids", num_ids);
global_id_view global_2_local((3u*num_ids)/2u);

View File

@ -50,6 +50,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@ -91,6 +93,13 @@ protected:
}
};
TEST_F( openmp, dynrankview_perf )
{
std::cout << "OpenMP" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
}
TEST_F( openmp, global_2_local)
{
std::cout << "OpenMP" << std::endl;

View File

@ -52,6 +52,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@ -85,6 +87,13 @@ protected:
}
};
TEST_F( threads, dynrankview_perf )
{
std::cout << "Threads" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
}
TEST_F( threads, global_2_local)
{
std::cout << "Threads" << std::endl;

View File

@ -80,7 +80,7 @@ struct UnorderedMapTest
, map(capacity)
, histogram(map.get_histogram())
{
Kokkos::Impl::Timer wall_clock ;
Kokkos::Timer wall_clock ;
wall_clock.reset();
value_type v = {};
@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name)
distance_out << "\b\b\b " << std::endl;
block_distance_out << "\b\b\b " << std::endl;
Kokkos::Impl::Timer wall_clock ;
Kokkos::Timer wall_clock ;
for (int i=0; i < num_collisions ; ++i) {
wall_clock.reset();
std::cout << "Collisions: " << collisions[i] << std::endl;

File diff suppressed because it is too large

View File

@ -77,10 +77,7 @@ private:
public:
typedef Kokkos::Experimental::MemoryPool
< typename traits::memory_space
, typename traits::execution_space
> memory_pool ;
typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
private:
@ -338,7 +335,7 @@ public:
void operator()( unsigned i ) const
{
if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() );
m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
}
m_chunks[i] = 0 ;
}
@ -397,7 +394,7 @@ public:
// The memory pool chunk is guaranteed to be a power of two
, m_chunk_shift(
Kokkos::Impl::integral_power_of_two(
m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) )
m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
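// Worked example (hypothetical sizes): get_min_block_size() == 1024 bytes
// with an 8-byte value_type gives 128 values per chunk, so m_chunk_shift == 7,
// m_chunk_mask == 127, and arg_size_max == 1000 yields
// m_chunk_max == (1000 + 127) >> 7 == 8 chunks.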
{

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,6 +45,7 @@
#define KOKKOS_BITSET_IMPL_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_BitOps.hpp>
#include <stdint.h>
#include <cstdio>
@ -52,122 +53,57 @@
#include <iostream>
#include <iomanip>
namespace Kokkos { namespace Impl {
namespace Kokkos {
namespace Impl {
KOKKOS_FORCEINLINE_FUNCTION
unsigned rotate_right(unsigned i, int r)
unsigned rotate_right( unsigned i, int r )
{
enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
return r ? ((i >> r) | (i << (size-r))) : i ;
enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
}
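// Example: for 32-bit unsigned, rotate_right(0x80000001u, 1) == 0xC0000000u.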
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_forward(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
return _bit_scan_forward(i);
#else
unsigned t = 1u;
int r = 0;
while (i && (i & t == 0))
{
t = t << 1;
++r;
}
return r;
#endif
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_reverse(unsigned i)
{
enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
while (i && (i & t == 0))
{
t = t >> 1;
++r;
}
return r;
#endif
}
// count the bits set
KOKKOS_FORCEINLINE_FUNCTION
int popcount(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ((i >> 1) & ~0u/3u); // temp
i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp
i = (i + (i >> 4)) & ~0u/255u*15u; // temp
return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
#endif
}
template <typename Bitset>
template < typename Bitset >
struct BitsetCount
{
typedef Bitset bitset_type;
typedef typename bitset_type::execution_space::execution_space execution_space;
typedef typename bitset_type::size_type size_type;
typedef size_type value_type;
typedef Bitset bitset_type;
typedef typename bitset_type::execution_space::execution_space execution_space;
typedef typename bitset_type::size_type size_type;
typedef size_type value_type;
bitset_type m_bitset;
BitsetCount( bitset_type const& bitset)
BitsetCount( bitset_type const& bitset )
: m_bitset(bitset)
{}
size_type apply() const
{
size_type count = 0u;
parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
return count;
}
KOKKOS_INLINE_FUNCTION
static void init( value_type & count)
void init( value_type & count ) const
{
count = 0u;
}
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & count, const volatile size_type & incr )
void join( volatile value_type & count, const volatile size_type & incr ) const
{
count += incr;
}
KOKKOS_INLINE_FUNCTION
void operator()( size_type i, value_type & count) const
void operator()( size_type i, value_type & count ) const
{
count += popcount(m_bitset.m_blocks[i]);
count += bit_count( m_bitset.m_blocks[i] );
}
};
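// Usage sketch (bitset_t stands for an illustrative Kokkos::Bitset type):
//   BitsetCount< bitset_t > counter( bits );
//   const auto num_set = counter.apply(); // parallel_reduce over the blocks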
}} //Kokkos::Impl
} // namespace Impl
} // namespace Kokkos
#endif // KOKKOS_BITSET_IMPL_HPP

View File

@ -713,13 +713,20 @@ public:
typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
typedef typename dView0::host_mirror_space host ;
typedef typename dView0::host_mirror_space host_drv_space ;
typedef Kokkos::Experimental::View< T , device > View0 ;
typedef Kokkos::Experimental::View< T* , device > View1 ;
typedef Kokkos::Experimental::View< T******* , device > View7 ;
typedef typename View0::host_mirror_space host_view_space ;
TestDynViewAPI()
{
run_test_resize_realloc();
run_test_mirror();
run_test();
run_test_scalar();
run_test();
run_test_const();
run_test_subview();
run_test_subview_strided();
@ -735,19 +742,147 @@ public:
TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
}
static void run_test_resize_realloc()
{
dView0 drv0("drv0", 10, 20, 30);
ASSERT_EQ( drv0.rank(), 3);
Kokkos::Experimental::resize(drv0, 5, 10);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 5);
ASSERT_EQ( drv0.dimension_1(), 10);
ASSERT_EQ( drv0.dimension_2(), 1);
Kokkos::Experimental::realloc(drv0, 10, 20);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 10);
ASSERT_EQ( drv0.dimension_1(), 20);
ASSERT_EQ( drv0.dimension_2(), 1);
}
static void run_test_mirror()
{
typedef Kokkos::Experimental::DynRankView< int , host > view_type ;
typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
typedef typename view_type::HostMirror mirror_type ;
view_type a("a");
mirror_type am = Kokkos::Experimental::create_mirror_view(a);
mirror_type ax = Kokkos::Experimental::create_mirror(a);
ASSERT_EQ( & a() , & am() );
ASSERT_EQ( a.rank() , am.rank() );
ASSERT_EQ( ax.rank() , am.rank() );
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
}
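// Summary sketch of the mirror semantics exercised above, assuming a typical
// device build: create_mirror() always allocates a fresh host copy, while
// create_mirror_view() allocates only when the memory spaces differ and
// otherwise returns an alias of the original allocation.
Kokkos::DynRankView<double, Kokkos::HostSpace> h("h", 100);
auto m1 = Kokkos::create_mirror( h );      // distinct allocation: m1.data() != h.data()
auto m2 = Kokkos::create_mirror_view( h ); // same memory space: m2.data() == h.data()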
static void run_test_scalar()
{
typedef typename dView0::HostMirror hView0 ;
typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView
dView0 dx , dy ;
hView0 hx , hy ;
@ -765,6 +900,79 @@ public:
Kokkos::Experimental::deep_copy( hy , dy );
ASSERT_EQ( hx(), hy() );
ASSERT_EQ( dx.rank() , hx.rank() );
ASSERT_EQ( dy.rank() , hy.rank() );
//View - DynRankView Interoperability tests
// deep_copy DynRankView to View
View0 vx("vx");
Kokkos::deep_copy( vx , dx );
ASSERT_EQ( rank(dx) , rank(vx) );
View0 vy("vy");
Kokkos::deep_copy( vy , dy );
ASSERT_EQ( rank(dy) , rank(vy) );
// deep_copy View to DynRankView
dView0 dxx("dxx");
Kokkos::deep_copy( dxx , vx );
ASSERT_EQ( rank(dxx) , rank(vx) );
View7 vcast = dx.ConstDownCast();
ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
View7 vcast1( dy.ConstDownCast() );
ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
//View - DynRankView Interoperability tests
// copy View to DynRankView
dView0 dfromvx( vx );
auto hmx = Kokkos::create_mirror_view(dfromvx) ;
Kokkos::deep_copy(hmx , dfromvx);
auto hvx = Kokkos::create_mirror_view(vx) ;
Kokkos::deep_copy(hvx , vx);
ASSERT_EQ( rank(hvx) , rank(hmx) );
ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
// copy-assign View to DynRankView
dView0 dfromvy = vy ;
auto hmy = Kokkos::create_mirror_view(dfromvy) ;
Kokkos::deep_copy(hmy , dfromvy);
auto hvy = Kokkos::create_mirror_view(vy) ;
Kokkos::deep_copy(hvy , vy);
ASSERT_EQ( rank(hvy) , rank(hmy) );
ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
View7 vtest1("vtest1",2,2,2,2,2,2,2);
dView0 dfromv1( vtest1 );
ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
dView0 dfromv2( vcast );
ASSERT_EQ( dfromv2.rank() , vcast.Rank );
ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
dView0 dfromv3 = vcast1;
ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
}
static void run_test()
@ -782,22 +990,32 @@ public:
(void) thing;
}
dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
ASSERT_TRUE( d_uninitialized.data() != nullptr );
ASSERT_EQ( d_uninitialized.rank() , 2 );
ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
ASSERT_EQ( d_uninitialized.dimension_2() , 1 );
dView0 dx , dy , dz ;
hView0 hx , hy , hz ;
ASSERT_TRUE( dx.ptr_on_device() == 0 );
ASSERT_TRUE( dy.ptr_on_device() == 0 );
ASSERT_TRUE( dz.ptr_on_device() == 0 );
ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
ASSERT_TRUE( dy.ptr_on_device() == 0 ); //Okay with UVM
ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
ASSERT_TRUE( hx.ptr_on_device() == 0 );
ASSERT_TRUE( hy.ptr_on_device() == 0 );
ASSERT_TRUE( hz.ptr_on_device() == 0 );
ASSERT_EQ( dx.dimension_0() , 0u );
ASSERT_EQ( dy.dimension_0() , 0u );
ASSERT_EQ( dz.dimension_0() , 0u );
ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
ASSERT_EQ( hx.dimension_0() , 0u );
ASSERT_EQ( hy.dimension_0() , 0u );
ASSERT_EQ( hz.dimension_0() , 0u );
ASSERT_EQ( dx.rank() , 0u );
ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
ASSERT_EQ( hx.rank() , 0u );
dx = dView0( "dx" , N1 , N2 , N3 );
@ -806,11 +1024,11 @@ public:
hx = hView0( "hx" , N1 , N2 , N3 );
hy = hView0( "hy" , N1 , N2 , N3 );
ASSERT_EQ( dx.dimension_0() , unsigned(N1) );
ASSERT_EQ( dy.dimension_0() , unsigned(N1) );
ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
ASSERT_EQ( dx.rank() , 3 );
ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
ASSERT_EQ( hx.rank() , 3 );
dx = dView0( "dx" , N0 , N1 , N2 , N3 );
@ -823,19 +1041,23 @@ public:
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
ASSERT_EQ( dx.rank() , 4 );
ASSERT_EQ( dy.rank() , 4 );
ASSERT_EQ( hx.rank() , 4 );
ASSERT_EQ( hy.rank() , 4 );
ASSERT_EQ( dx.use_count() , size_t(1) );
dView0_unmanaged unmanaged_dx = dx;
ASSERT_EQ( dx.use_count() , size_t(1) );
dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
dx.dimension_0(),
dx.dimension_1(),
dx.dimension_2(),
dx.dimension_3());
{
// Destruction of this view should be harmless
const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
@ -888,6 +1110,19 @@ public:
hx = Kokkos::Experimental::create_mirror( dx );
hy = Kokkos::Experimental::create_mirror( dy );
ASSERT_EQ( hx.rank() , dx.rank() );
ASSERT_EQ( hy.rank() , dy.rank() );
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
// T v1 = hx() ; // Generates compile error as intended
// T v2 = hx(0,0) ; // Generates compile error as intended
// hx(0,0) = v2 ; // Generates compile error as intended
@ -990,7 +1225,9 @@ public:
for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
{ ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
}}}}
// ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
}
dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
@ -1006,6 +1243,35 @@ public:
ASSERT_TRUE( dx.ptr_on_device() == 0 );
ASSERT_TRUE( dy.ptr_on_device() == 0 );
ASSERT_TRUE( dz.ptr_on_device() == 0 );
//View - DynRankView Interoperability tests
// deep_copy from view to dynrankview
const int testdim = 4;
dView0 dxx("dxx",testdim);
View1 vxx("vxx",testdim);
auto hvxx = Kokkos::create_mirror_view(vxx);
for (int i = 0; i < testdim; ++i)
{ hvxx(i) = i; }
Kokkos::deep_copy(vxx,hvxx);
Kokkos::deep_copy(dxx,vxx);
auto hdxx = Kokkos::create_mirror_view(dxx);
Kokkos::deep_copy(hdxx,dxx);
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hdxx(i) ); }
ASSERT_EQ( rank(hdxx) , rank(hvxx) );
ASSERT_EQ( hdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
// deep_copy from dynrankview to view
View1 vdxx("vdxx",testdim);
auto hvdxx = Kokkos::create_mirror_view(vdxx);
Kokkos::deep_copy(hvdxx , hdxx);
ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
ASSERT_EQ( hvdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
}
typedef T DataType ;
@ -1059,35 +1325,66 @@ public:
// N0 = 1000,N1 = 3,N2 = 5,N3 = 7
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
ASSERT_EQ( d7.rank() , 7 );
sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview
sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 );
ASSERT_EQ( ds0.rank() , 0 );
//Basic test - ALL
sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs
sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() );
ASSERT_EQ( dsALL.rank() , 7 );
// Send a single value for one rank
// Send a value to final rank returning rank 6 subview
sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
ASSERT_EQ( dsm1.rank() , 6 );
// Send a std::pair as a rank
// Send a std::pair as argument to a rank
sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
ASSERT_EQ( dssp.rank() , 7 );
// Send a kokkos::pair as a rank; take default layout as input
// Send a kokkos::pair as argument to a rank; take default layout as input
dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
ASSERT_EQ( dd0.rank() , 7 );
sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
ASSERT_EQ( dtkp.rank() , 7 );
// Return rank 7 subview, taking a pair as one argument, layout stride input
sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
ASSERT_EQ( ds7.rank() , 7 );
// Default Layout DynRankView
dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
ASSERT_EQ( dv6.rank() , 6 );
// DynRankView with LayoutRight
typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
ASSERT_EQ( dr5.rank() , 5 );
// LayoutStride but arranged as LayoutRight
unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 };
sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) );
// NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that
// rank deduction can properly take place
unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
ls.dimension[5] = ~size_t(0);
ls.dimension[6] = ~size_t(0);
ls.dimension[7] = ~size_t(0);
sdView d5("d5", ls);
ASSERT_EQ( d5.rank() , 5 );
// LayoutStride arranged as LayoutRight - commented out as example that fails unit test
// unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
// sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
//
// Fails the following unit test:
// ASSERT_EQ( d5.rank() , dr5.rank() );
//
// Explanation: In the construction of the Kokkos::LayoutStride above, since the
// remaining dimensions are not specified, they default to values of 0
// rather than ~size_t(0).
// When passed to the DynRankView constructor, the default dimensions (of 0)
// are counted toward the dynamic rank, returning an incorrect value
// (i.e. rank 7 rather than 5).
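// Illustrative sketch (not the actual Kokkos implementation) of the rank
// deduction rule described above: the dynamic rank is the count of leading
// extents that are not the "unused" sentinel ~size_t(0).
inline unsigned deduced_rank_sketch( const size_t dims[8] ) {
unsigned r = 0;
while ( r < 7 && dims[r] != ~size_t(0) ) ++r; // stop at the first unused extent
return r;
}
// With dims = { N0,N1,N2,2,2,~size_t(0),~size_t(0),~size_t(0) } this yields 5;
// with the trailing extents left at 0 it yields 7, the failure mode above.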
// Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should)
ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
@ -1100,21 +1397,21 @@ public:
// Rank 5 subview of rank 5 dynamic rank view, layout stride input
sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
ASSERT_EQ( ds5.rank() , 5 );
// Pass in extra ALL arguments beyond the rank of the DynRankView.
// This behavior is allowed: the extra ALL arguments are ignored when
// src.rank() < the number of arguments, but be careful!
sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
ASSERT_EQ( ds5.rank() , ds5plus.rank() );
ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
ASSERT_EQ( ds5.rank() , ds5plus.rank() );
ASSERT_EQ( ds5.rank() , 5 );
#if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 );
ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 ); // passing an argument for a rank beyond the view's rank is allowed iff it is 0.
#endif
// Similar test to rank 5 above, but create rank 4 subview
@ -1131,9 +1428,9 @@ public:
static void run_test_subview_strided()
{
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;
drview_left xl2( "xl2", 100 , 200 );
drview_right xr2( "xr2", 100 , 200 );
@ -1159,35 +1456,37 @@ public:
drview_left xl4( "xl4", 10 , 20 , 30 , 40 );
drview_right xr4( "xr4", 10 , 20 , 30 , 40 );
drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
//Replace subdynrankview with subview - test
drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
ASSERT_EQ( yl4.rank() , 2);
ASSERT_EQ( yr4.rank() , 2);
ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
}
static void run_test_vector()
{
static const unsigned Length = 1000 , Count = 8 ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;
multivector_type mv = multivector_type( "mv" , Length , Count );
multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;
svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
@ -1251,7 +1550,6 @@ public:
const_smultivector_type cmv( mv );
typename smultivector_type::const_type cmvX( cmv );
typename const_smultivector_type::const_type ccmvX( cmv );
}
};

View File

@ -61,8 +61,7 @@ struct TestDynamicView
typedef typename Space::execution_space execution_space ;
typedef typename Space::memory_space memory_space ;
typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space >
memory_pool_type ;
typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
@ -129,11 +128,9 @@ struct TestDynamicView
typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;
const unsigned int chunk_size = 1024 ;
// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) );
memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );
// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);

View File

@ -34,6 +34,7 @@
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC
#cmakedefine KOKKOS_HAVE_DEBUG
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_HAVE_CXX11
#cmakedefine KOKKOS_HAVE_CUSPARSE

View File

@ -8,11 +8,22 @@ SET(SOURCES
PerfTestCuda.cpp
)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerfTest
# Per #374, we always want to build this test, but we only want to run
# it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
TRIBITS_ADD_EXECUTABLE(
PerfTestExec
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerfTest
NAME PerfTestExec
COMM serial mpi
NUM_MPI_PROCS 1
CATEGORIES PERFORMANCE
FAIL_REGULAR_EXPRESSION " FAILED "
)

View File

@ -159,7 +159,7 @@ struct TextureFetch
Kokkos::Cuda::fence();
Kokkos::Impl::Timer timer;
Kokkos::Timer timer;
for (int j=0; j<10; ++j) {
RandomReduce f(array,indexes);
f.apply(reduce);

View File

@ -153,7 +153,7 @@ struct ModifiedGramSchmidt
Kokkos::deep_copy( one , (Scalar) 1 );
Kokkos::Impl::Timer timer ;
Kokkos::Timer timer ;
for ( size_type j = 0 ; j < count ; ++j ) {
// Reduction : tmp = dot( Q(:,j) , Q(:,j) );

View File

@ -252,7 +252,7 @@ struct HexGrad
execution_space::fence();
for ( int i = 0 ; i < iter ; ++i ) {
Kokkos::Impl::Timer timer ;
Kokkos::Timer timer ;
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
execution_space::fence();
const double dt = timer.seconds();

View File

@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) {
Kokkos::Impl::Timer timer;
T res = LoopVariant<T>(loop,test);
double time1 = timer.seconds();
double time = timer.seconds();
timer.reset();
T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
double time2 = timer.seconds();
double timeNonAtomic = timer.seconds();
timer.reset();
T resSerial = LoopVariantSerial<T>(loop,test);
double time3 = timer.seconds();
double timeSerial = timer.seconds();
time1*=1e6/loop;
time2*=1e6/loop;
time3*=1e6/loop;
time *=1e6/loop;
timeNonAtomic*=1e6/loop;
timeSerial *=1e6/loop;
//textcolor_standard();
bool passed = true;
if(resSerial!=res) passed = false;
//if(!passed) textcolor(RESET,BLACK,YELLOW);
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
type_name,test,passed?"PASSED":"FAILED",loop,
1.0*resSerial,1.0*res,1.0*resNonAtomic,
timeSerial,time,timeNonAtomic,(int)sizeof(T));
//if(!passed) textcolor_standard();
printf("\n");
}
@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) {
int main(int argc, char* argv[])
{
int type = -1;
int loop = 1000000;
int loop = 100000;
int test = -1;
for(int i=0;i<argc;i++)

View File

@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif
namespace Kokkos {
namespace Impl {
struct CudaLockArraysStruct {
int* atomic;
int* scratch;
int* threadid;
};
}
}
__device__ __constant__
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
int* kokkos_impl_cuda_atomic_lock_array ;
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
}
}
namespace Kokkos {
namespace Impl {
__device__ inline
@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}
__device__ inline
@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}
}
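// Sketch of how the two device functions above are typically paired to
// emulate an atomic update on a type with no native CUDA atomic. The functor
// Op is hypothetical, and the real Kokkos implementation also issues memory
// fences; this only illustrates the spin-on-lock pattern.
template< class T , class Op >
__device__ void locked_update_sketch( T * const dest , Op op ) {
bool done = false ;
while ( ! done ) {
if ( Kokkos::Impl::lock_address_cuda_space( (void*) dest ) ) {
*dest = op( *dest ) ; // exclusive access while the per-address lock is held
Kokkos::Impl::unlock_address_cuda_space( (void*) dest ) ;
done = true ;
}
}
}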
@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > {
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
#ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
// Invoke the driver function on the device
@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > {
#endif
#ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );

View File

@ -51,10 +51,10 @@
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Kokkos_Core.hpp>
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
namespace Kokkos {
#if ! KOKKOS_USING_EXP_VIEW
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
enum { TEXTURE_BOUND_1D = 2u << 27 };
if ( tracker.attribute() == NULL ) {
// check for correct allocator
const bool ok_alloc = tracker.allocator()->support_texture_binding();
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
if (ok_alloc && ok_count) {
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
tracker.set_attribute( attr );
}
else {
std::ostringstream oss;
oss << "Error: Cannot attach texture object";
if (!ok_alloc) {
oss << ", incompatabile allocator " << tracker.allocator()->name();
}
if (!ok_count) {
oss << ", array " << tracker.label() << " too large";
}
oss << ".";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
std::ostringstream oss;
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
} // unnamed namespace
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
void CudaSpace::access_error()
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const )
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
@ -212,15 +133,6 @@ bool CudaUVMSpace::available()
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel() {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_atomic_lock_array[i] = 0;
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* lock_array_cuda_space_ptr(bool deallocate) {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
@ -845,13 +767,60 @@ int* lock_array_cuda_space_ptr(bool deallocate) {
return ptr;
}
void init_lock_array_cuda_space() {
int is_initialized = 0;
if(! is_initialized) {
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
return ptr;
}
}
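// Hypothetical usage illustrating the grow-only policy of
// cuda_resize_scratch_space() above: the buffer is reallocated only when the
// request exceeds the current size, unless force_shrink is passed.
void * p1 = Kokkos::Impl::cuda_resize_scratch_space( 1 << 20 );        // allocates 1 MiB
void * p2 = Kokkos::Impl::cuda_resize_scratch_space( 1 << 10 );        // no-op: p2 == p1
void * p3 = Kokkos::Impl::cuda_resize_scratch_space( 1 << 10 , true ); // shrinks to 1 KiB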

View File

@ -50,7 +50,6 @@
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos {
namespace Impl {

View File

@ -1,198 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
)
: m_tex_obj(0)
{
cuda_device_synchronize();
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
cuda_device_synchronize();
}
TextureAttribute::~TextureAttribute()
{
if (m_tex_obj) {
cudaDestroyTextureObject( m_tex_obj );
}
}
/*--------------------------------------------------------------------------*/
void * CudaMallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
return ptr;
}
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaUVMAllocator::allocate( size_t size )
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
return ptr;
#else
throw_runtime_exception( "CUDA VERSION does not support UVM" );
return NULL;
#endif
}
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaHostAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
return ptr;
}
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
} catch(...) {}
}
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */

View File

@ -1,190 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos { namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
#else
typedef const void * cuda_texture_object_type ;
#endif
struct TextureAttribute : public AllocatorAttributeBase
{
cuda_texture_object_type m_tex_obj ;
TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
);
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedUVMAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@ -51,8 +51,8 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
@ -70,7 +70,7 @@ __device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
int* kokkos_impl_cuda_atomic_lock_array ;
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
@ -190,7 +190,7 @@ namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 8 };
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices()
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
}
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
@ -226,14 +229,6 @@ private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
#if ! KOKKOS_USING_EXP_VIEW
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
#endif
public:
@ -255,6 +250,8 @@ public:
size_type * m_scratchUnified ;
cudaStream_t * m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
@ -293,6 +290,8 @@ public:
size_type * scratch_unified( const size_type size );
};
int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;
//----------------------------------------------------------------------------
@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton()
void CudaInternal::initialize( int cuda_device_id , int stream_count )
{
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = 1;
if ( is_initialized() ) return;
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_cuda_space();
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
}
@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
#endif
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
@ -582,26 +577,15 @@ CudaInternal::scratch_space( const Cuda::size_type size )
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
#endif
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
}
return m_scratchSpace ;
@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
Record::increment( r );
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchUnified ;
@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
void CudaInternal::finalize()
{
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
lock_array_cuda_space_ptr(true);
atomic_lock_array_cuda_space_ptr(false);
scratch_lock_array_cuda_space_ptr(false);
threadid_lock_array_cuda_space_ptr(false);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
@ -655,14 +632,6 @@ void CudaInternal::finalize()
::free( m_stream );
}
#if ! KOKKOS_USING_EXP_VIEW
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@ -670,8 +639,6 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
#endif
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
m_maxWarpCount = 0 ;
@ -730,7 +697,13 @@ int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
std::vector<unsigned>
Cuda::detect_device_arch()
@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch()
}
void Cuda::finalize()
{ Impl::CudaInternal::singleton().finalize(); }
{
Impl::CudaInternal::singleton().finalize();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )

View File

@ -57,17 +57,20 @@ template<class DriverType, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra);
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length);
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -123,13 +130,15 @@ template<class DriverType, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra);
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
blockSize*=2;
//calculate the occupancy with that optBlockSize and check whether it's larger than the largest one found so far
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> {
while(blockSize<1024) {
blockSize*=2;
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,

File diff suppressed because it is too large

View File

@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value,
cuda_inter_warp_reduction(value,join,max_active_thread);
}
template< class FunctorType , class JoinOp>
template< class FunctorType , class JoinOp , class ArgTag = void >
__device__
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value,
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
const JoinOp& join,
Cuda::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = 0;
value = neutral;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
@ -366,7 +367,12 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__CUDA_ARCH__ < 500)
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
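// Why 'value = neutral' above instead of the old 'value = 0': a hedged host
// illustration (plain C++, names made up). Seeding a min-reduction with
// literal zero corrupts the answer; seeding with the operation's neutral
// element does not.
#include <algorithm>
#include <cstdio>
#include <limits>

int main()
{
  const int partials[3] = { 7 , 5 , 9 };          // per-block partial minima
  int bad  = 0 ;                                  // old seed: literal zero
  int good = std::numeric_limits<int>::max();     // neutral element for min
  for ( int p : partials ) { bad = std::min( bad , p ); good = std::min( good , p ); }
  std::printf( "seed 0 -> %d (wrong), neutral -> %d (right)\n" , bad , good );
  return 0 ;
}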


@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
}
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
}
}
} while(1);
}
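// The union-plus-two-shuffles idiom above, isolated as a hedged sketch (same
// pre-CUDA-9 __shfl API as this file, 64-bit pointers assumed): a pointer
// cannot be shuffled in one call, so its two 32-bit halves are broadcast
// from lane 0 separately.
__device__ void broadcast_pointer_from_lane0( void * & p )
{
  union { void * ptr ; int raw[2] ; } u ;
  u.ptr    = p ;
  u.raw[0] = __shfl( u.raw[0] , 0 );  // low  32 bits from lane 0
  u.raw[1] = __shfl( u.raw[1] , 0 );  // high 32 bits from lane 0
  p = u.ptr ;
}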
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute before\n");
#endif
// Query the stack size, in bytes:
//
// size_t stack_size = 0 ;
// CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
//
// If not large enough then set the stack size, in bytes:
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute after\n");
#endif
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */


@ -0,0 +1,519 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
{
public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static
void execute( queue_type * const );
template< typename FunctorType >
static
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
{
using TaskType = TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
};
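// Why proc_set_apply launches a <<<1,1>>> kernel: host code cannot form the
// address of a __device__ function, so one device thread must store it.
// Hedged standalone sketch of the same idiom (all names are made up):
typedef void (*device_fn)(int);

__device__ void my_task_apply( int ) {}

__global__ void grab_device_fn( device_fn * slot ) { *slot = & my_task_apply ; }

// host side: grab_device_fn<<<1,1>>>( slot_in_device_memory );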
extern template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
* Cuda thread blocks for tasking are dimensioned:
* blockDim.x == vector length
* blockDim.y == team size
* blockDim.z == number of teams
* where
* blockDim.x * blockDim.y == WarpSize
*
* Both single thread and thread team tasks are run by a full Cuda warp.
* A single thread task is called by warp lane #0 and the remaining
* lanes of the warp are idle.
*/
template<>
class TaskExec< Kokkos::Cuda >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
public:
#if defined( __CUDA_ARCH__ )
__device__ void team_barrier() { /* __threadfence_block(); */ }
__device__ int team_rank() const { return threadIdx.y ; }
__device__ int team_size() const { return m_team_size ; }
#else
__host__ void team_barrier() {}
__host__ int team_rank() const { return 0 ; }
__host__ int team_size() const { return 0 ; }
#endif
};
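// A concrete instance of the dimensioning documented above (illustrative
// numbers, not taken from this commit): vector length 4 and team size 8
// fill one 32-thread warp, and four such warps share a block.
//
//   const dim3 block( 4 /* vector length */ , 8 /* team size */ , 4 /* teams */ );
//   // invariant: block.x * block.y == 32 == CudaTraits::WarpSize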
//----------------------------------------------------------------------------
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.y )
, end(arg_count)
, increment( blockDim.y )
, thread(arg_thread)
{}
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
)
: start( arg_start + threadIdx.y )
, end( arg_end)
, increment( blockDim.y )
, thread( arg_thread )
{}
#else
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
);
#endif
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.x )
, end(arg_count)
, increment( blockDim.x )
, thread(arg_thread)
{}
#else
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
#endif
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
}
}
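// Hedged host-side simulation of the strided pattern above for one warp
// (assumed numbers: warp of 32 lanes, stride/vector length 4, team size 8).
// After the deltas 16, 8, 4, lanes 0..3 each hold the sum over their vector
// lane across all eight team members.
#include <cstdio>

int main()
{
  const int stride = 4 , team_size = 8 , width = stride * team_size ; // 32
  int val[32] , next[32] ;
  for ( int i = 0 ; i < width ; ++i ) val[i] = i ;
  for ( int delta = width >> 1 ; delta >= stride ; delta >>= 1 ) {
    // shfl_down(v,delta,width): lane i reads lane i+delta, else its own value
    for ( int i = 0 ; i < width ; ++i )
      next[i] = val[i] + ( i + delta < width ? val[i+delta] : val[i] ) ;
    for ( int i = 0 ; i < width ; ++i ) val[i] = next[i] ;
  }
  for ( int x = 0 ; x < stride ; ++x )
    std::printf( "vector lane %d total: %d\n" , x , val[x] ); // 112 120 128 136
  return 0 ;
}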
// multiple within-warp non-strided reductions
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int vec_length)
{
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
}
}
// broadcast within warp
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_warp_broadcast
(ValueType& val,
int src_lane,
int width)
{
return Kokkos::shfl(val, src_lane, width);
}
// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction<ValueType, JoinType>(
join,
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
//TODO what is the point of creating this temporary?
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
//initialized_result = multi_shfl_warp_reduction(
multi_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda) {
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.y exclusive scan on 'val'
// accum = running total carried in from previous iterations;
// local_total = sum contributed by this iteration. First an INCLUSIVE scan:
for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
if(threadIdx.y*blockDim.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val,
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
Impl::CudaTraits::WarpSize);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
if ( threadIdx.y == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.x exclusive scan on 'val'
// accum = running total carried in from previous iterations;
// local_total = sum contributed by this iteration. First an INCLUSIVE scan:
for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, blockDim.x);
if(threadIdx.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, 1, blockDim.x);
if ( threadIdx.x == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
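// Hedged host sketch of the inclusive-scan-then-shift trick used by both
// parallel_scan overloads above, on one made-up vector of 8 contributions.
#include <cstdio>

int main()
{
  const int n = 8 ;
  int val[8] = { 3 , 1 , 4 , 1 , 5 , 9 , 2 , 6 };
  // inclusive scan with doubling offsets (mirrors the shfl_up loop);
  // sweeping downward makes each pass read only not-yet-updated entries
  for ( int offset = 1 ; offset < n ; offset <<= 1 )
    for ( int i = n - 1 ; i >= offset ; --i ) val[i] += val[i-offset] ;
  const int total = val[n-1] ;        // last slot holds the grand total
  // shift right by one to make the scan exclusive; slot 0 becomes zero
  for ( int i = n - 1 ; i > 0 ; --i ) val[i] = val[i-1] ;
  val[0] = 0 ;
  for ( int i = 0 ; i < n ; ++i ) std::printf( "%d " , val[i] ); // 0 3 4 8 9 14 23 25
  std::printf( "| total %d\n" , total );                         // 31
  return 0 ;
}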
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */


@ -46,9 +46,10 @@
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
// #define DETAILED_PRINT
@ -93,9 +94,8 @@ CudaTaskPolicyQueue
, const unsigned arg_team_size
)
: m_space( Kokkos::CudaUVMSpace()
, arg_task_max_size
, arg_task_max_size * arg_task_max_count
, 1 /* only one level of memory pool */
, arg_task_max_size * arg_task_max_count * 1.2
, 16 /* log2(superblock size) */
)
, m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 }
@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) {
member( kokkos_impl_cuda_shared_memory<void>()
, 16 /* shared_begin */
, team_task->m_shmem_size /* shared size */
, 0 /* scratch level 1 pointer */
, 0 /* scratch level 1 size */
, 0 /* league rank */
, 1 /* league size */
);
@ -926,5 +928,5 @@ void Task::clear_dependence()
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */


@ -47,19 +47,11 @@
#define KOKKOS_CUDA_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && \
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_CUDA_TASK_POLICY
/* The TaskPolicy< Cuda > capability requires nvcc using the option:
* --relocatable-device-code=true
*/
#include <Kokkos_Cuda.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
@ -81,8 +73,6 @@ public:
private:
friend struct CudaTaskPolicyQueue ;
CudaTaskPolicyQueue * m_policy ;
TaskMember * volatile * m_queue ;
function_team_type m_team ; ///< Apply function on CUDA
@ -819,9 +809,11 @@ public:
static member_type member_single()
{
return
member_type( 0 /* shared memory */
, 0 /* shared memory begin */
, 0 /* shared memory size */
member_type( 0 /* shared memory pointer */
, 0 /* shared memory begin offset */
, 0 /* shared memory end offset */
, 0 /* scratch level_1 pointer */
, 0 /* scratch level_1 size */
, 0 /* league rank */
, 1 /* league size */ );
}
@ -832,10 +824,10 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */


@ -56,8 +56,6 @@
#include <impl/Kokkos_Shape.hpp>
#include <Kokkos_View.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace >
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if ! KOKKOS_USING_EXP_VIEW
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4).
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back either to normal reads out of global memory,
// or to the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0).
template< typename ValueType
, class MemorySpace
, class AliasType =
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void
>::type
>::type
>::type
>::type
>
class CudaTextureFetch {
private:
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
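// The aliasing trick described above, isolated as a hedged device-side sketch
// (requires __CUDA_ARCH__ >= 350 for __ldg; the function name is made up):
// an 8-byte double is fetched through the 8-byte ::int2 alias so the
// read-only data path can service it.
__device__ double ldg_double_via_int2( const double * p )
{
  ::int2 v = __ldg( reinterpret_cast< const ::int2 * >( p ) );
  return *reinterpret_cast< double * >( & v );
}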
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, float4 > {
private:
typedef float4 AliasType;
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
union Float4ValueType {
float4 f4[2];
ValueType val;
};
Float4ValueType convert;
convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) );
convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 );
return convert.val;
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, void >
{
private:
const ValueType * m_ptr ;
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_ptr(0) {};
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
m_ptr = rhs.m_ptr;
return *this ;
}
explicit KOKKOS_INLINE_FUNCTION
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
m_ptr = base_view_ptr;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
m_ptr = base_view_ptr;
return *this;
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
return m_ptr[ i ];
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
&&
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
&&
ViewTraits::memory_traits::RandomAccess
>::type >
{
public:
enum { ReturnTypeIsReference = false };
typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
, typename ViewTraits::memory_space> handle_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
{
return handle_type(arg_data_ptr, arg_tracker);
}
typedef typename ViewTraits::value_type return_type;
};
}
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
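// Hedged sketch of the use case this ViewDataHandle specialization serves
// (view names are made up): a const, RandomAccess View in CudaSpace routes
// its element reads through the texture-fetch handle above.
//
//   typedef Kokkos::View< const double * , Kokkos::CudaSpace ,
//                         Kokkos::MemoryTraits< Kokkos::RandomAccess > > tex_view ;
//   // assigning from a View<double*,CudaSpace> "a" attaches the texture object:
//   // tex_view b = a ;  // b(i) inside a kernel then uses tex1Dfetch / __ldg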
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */


@ -0,0 +1,611 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <initializer_list>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
#define KOKKOS_MDRANGE_IVDEP
#endif
namespace Kokkos { namespace Experimental {
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
Flat, // Do not tile, only valid for inner direction
};
template <typename ExecSpace>
struct default_outer_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
template <typename ExecSpace>
struct default_inner_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
// Iteration Pattern
template < unsigned N
, Iterate OuterDir = Iterate::Default
, Iterate InnerDir = Iterate::Default
>
struct Rank
{
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
static constexpr int rank = N;
static constexpr Iterate outer_direction = OuterDir;
static constexpr Iterate inner_direction = InnerDir;
};
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
{
using range_policy = RangePolicy<Properties...>;
static_assert( !std::is_same<range_policy,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename range_policy::iteration_pattern;
using work_tag = typename range_policy::work_tag;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
? iteration_pattern::outer_direction
: default_outer_direction< typename range_policy::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename range_policy::execution_space>::value ) ;
// Ugly workaround for Intel 14 not handling scoped enums correctly
static constexpr int Flat = static_cast<int>( Iterate::Flat );
static constexpr int Right = static_cast<int>( Iterate::Right );
using size_type = typename range_policy::index_type;
using index_type = typename std::make_signed<size_type>::type;
template <typename I>
MDRangePolicy( std::initializer_list<I> upper_corner )
{
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
const auto u = upper_corner.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(0);
m_dim[i] = static_cast<index_type>(u[i]);
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB, typename T>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
, std::initializer_list<T> tile
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
static_assert( inner_direction != Flat, "Kokkos Error: tiling not supported with flat iteration" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
const auto t = tile.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
index_type m_offset[rank];
index_type m_dim[rank];
int m_tile[rank];
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
};
namespace Impl {
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
using work_tag = typename MDRange::work_tag;
using index_type = typename MDRange::index_type;
using size_type = typename MDRange::size_type;
MDRange m_range;
Functor m_func;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor const& f )
: m_range(range)
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor && f )
: m_range(range)
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor const& f )
: m_range( std::forward<MDRange>(range) )
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor && f )
: m_range( std::forward<MDRange>(range) )
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor && ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor && ) = default;
// Rank-2, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1 );
}}
}
}
// Rank-2, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( tag, i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1 );
}}
}
}
//---------------------------------------------------------------------------
// Rank-3, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( work_tag{}
, m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( work_tag{}
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1, i2 );
}}}
}
}
// Rank-3, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( tag, i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1, i2 );
}}}
}
}
};
} // namespace Impl
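// Hedged standalone check of the tile-index arithmetic above (rank-2,
// outer_direction == Right, zero offsets; the dims and tile sizes are made up):
#include <algorithm>
#include <cstdio>

int main()
{
  const int dim[2]  = { 10 , 7 };   // m_dim
  const int tile[2] = { 4 , 4 };    // m_tile
  const int tile_dim[2] = { ( dim[0] + tile[0] - 1 ) / tile[0]     // 3
                          , ( dim[1] + tile[1] - 1 ) / tile[1] };  // 2
  for ( int t = 0 ; t < tile_dim[0] * tile_dim[1] ; ++t ) {        // 6 tiles
    const int t0 = t / tile_dim[1] , t1 = t % tile_dim[1] ;
    const int b0 = t0 * tile[0] , e0 = std::min( b0 + tile[0] , dim[0] );
    const int b1 = t1 * tile[1] , e1 = std::min( b1 + tile[1] , dim[1] );
    std::printf( "tile %d -> i0 in [%d,%d), i1 in [%d,%d)\n" , t , b0 , e0 , b1 , e1 );
  }
  return 0 ;
}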
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
}} // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
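// Hedged usage sketch of the interface defined above (the functor, extents,
// and label are made up; the policy spelling assumes RangePolicy accepts the
// Rank iteration pattern as a property, as this commit's traits do):
#include <Kokkos_Core.hpp>

struct Scale {
  Kokkos::View<double**> a ;
  KOKKOS_INLINE_FUNCTION void operator()( const int i , const int j ) const
    { a(i,j) *= 2.0 ; }
};

void scale_example( Kokkos::View<double**> a , int N0 , int N1 )
{
  using namespace Kokkos::Experimental;
  MDRangePolicy< Rank< 2 , Iterate::Right , Iterate::Right > >
    policy( { 0 , 0 } , { N0 , N1 } );      // corners; default 4x4 tiles
  md_parallel_for( policy , Scale{ a } , "scale" );
}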

File diff suppressed because it is too large.


@ -121,13 +121,22 @@ public:
return *this;
}
//! Assignment operator.
/// \brief Assignment operator, for volatile <tt>*this</tt> and
/// nonvolatile input.
///
/// \param src [in] Input; right-hand side of the assignment.
///
/// This operator returns \c void instead of <tt>volatile
/// complex<RealType>& </tt>. See Kokkos Issue #177 for the
/// explanation. In practice, this means that you should not chain
/// assignments with volatile lvalues.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
void operator= (const complex<InputRealType>& src) volatile {
re_ = src.re_;
im_ = src.im_;
return *this;
// We deliberately do not return anything here. See explanation
// in public documentation above.
}
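// Practical consequence of the void return (hedged, illustrative snippet):
//
//   Kokkos::complex<double> a( 1.0 , 2.0 ) , c ;
//   volatile Kokkos::complex<double> b ;
//   b = a ;          // still fine: this operator= assigns normally
//   c = ( b = a ) ;  // no longer compiles: the volatile operator= returns void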
//! Assignment operator.


@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,86 +36,43 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
#define KOKKOS_BASIC_ALLOCATORS_HPP
#ifndef KOKKOS_CORE_CONCEPTS_HPP
#define KOKKOS_CORE_CONCEPTS_HPP
#if ! KOKKOS_USING_EXP_VIEW
#include <type_traits>
namespace Kokkos { namespace Impl {
namespace Kokkos {
//Schedules for Execution Policies
struct Static {};
struct Dynamic {};
/// class UnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
class UnmanagedAllocator
//Schedule Wrapper Type
template<class T>
struct Schedule
{
public:
static const char * name() { return "Unmanaged Allocator"; }
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static_assert( std::is_same<T,Static>::value
|| std::is_same<T,Dynamic>::value
, "Kokkos: Invalid Schedule<> type."
);
using schedule_type = Schedule<T>;
using type = T;
};
/// class MallocAllocator
class MallocAllocator
//Specify Iteration Index Type
template<typename T>
struct IndexType
{
public:
static const char * name()
{
return "Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
using index_type = IndexType<T>;
using type = T;
};
} // namespace Kokkos
/// class AlignedAllocator
/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
class AlignedAllocator
{
public:
static const char * name()
{
return "Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
/// class PageAlignedAllocator
/// memory aligned to PAGE_SIZE
class PageAlignedAllocator
{
public:
static const char * name()
{
return "Page Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_BASIC_ALLOCATORS_HPP
#endif // KOKKOS_CORE_CONCEPTS_HPP
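These wrappers are consumed as execution-policy properties; a brief sketch (property order is arbitrary, and the lambda form assumes KOKKOS_LAMBDA is available for the configured backend):

#include <cstdint>
#include <Kokkos_Core.hpp>

void scale(Kokkos::View<float*> x, int64_t n) {
  // Dynamic scheduling with 64-bit iteration indices.
  typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>,
                               Kokkos::IndexType<int64_t> > policy_t;
  Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(const int64_t i) { x(i) *= 2.0f; } );
}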

View File

@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
} // namespace Kokkos
#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
using Kokkos::Experimental::kokkos_malloc ;
@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ;
}
#else
namespace Kokkos {
namespace Impl {
// should only be used by kokkos_malloc and kokkos_free
struct MallocHelper
{
static void increment_ref_count( AllocationTracker const & tracker )
{
tracker.increment_ref_count();
}
static void decrement_ref_count( AllocationTracker const & tracker )
{
tracker.decrement_ref_count();
}
};
} // namespace Impl
/* Allocate memory from a memory space.
* The allocation is tracked in Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const std::string label, size_t count) {
if(count == 0) return NULL;
typedef typename Arg::memory_space MemorySpace;
Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);
Impl::MallocHelper::increment_ref_count( tracker );
return tracker.alloc_ptr();
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const size_t& count) {
return kokkos_malloc<Arg>("DefaultLabel",count);
}
/* Free memory from a memory space.
*/
template< class Arg = DefaultExecutionSpace>
void kokkos_free(const void* ptr) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
if (tracker.is_valid()) {
Impl::MallocHelper::decrement_ref_count( tracker );
}
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_realloc(const void* old_ptr, size_t size) {
if(old_ptr == NULL)
return kokkos_malloc<Arg>(size);
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
tracker.reallocate(size);
return tracker.alloc_ptr();
}
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
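With the tracker-based overloads gone, allocations go through the Experimental functions re-exported above; a hedged sketch of the surviving interface (the explicit HostSpace template argument and label are illustrative):

#include <cstddef>
#include <Kokkos_Core.hpp>

void demo(std::size_t n) {
  // The label appears in Kokkos' allocation tracking, easing leak diagnosis.
  double* p = static_cast<double*>(
    Kokkos::kokkos_malloc<Kokkos::HostSpace>("demo_buffer", n * sizeof(double)) );
  p = static_cast<double*>(
    Kokkos::kokkos_realloc<Kokkos::HostSpace>(p, 2 * n * sizeof(double)) );
  Kokkos::kokkos_free<Kokkos::HostSpace>(p);
}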

View File

@ -69,6 +69,9 @@ namespace {
/**\brief Token to indicate that a parameter's value is to be automatically selected */
constexpr AUTO_t AUTO = Kokkos::AUTO_t();
}
struct InvalidType {};
}
//----------------------------------------------------------------------------
@ -205,7 +208,7 @@ namespace Impl {
template< class Functor
, class Policy
, class EnableFunctor = void
, class EnablePolicy = void
, class EnablePolicy = void
>
struct FunctorPolicyExecutionSpace;
@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelReduce ;
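The new ReducerType parameter (defaulting to InvalidType) lets a reducer object ride along with the functor; a sketch, assuming the Max reducer shipped with this snapshot's Kokkos_Parallel_Reduce.hpp lives in Kokkos::Experimental and is constructed from a host scalar:

#include <Kokkos_Core.hpp>

double max_entry(Kokkos::View<double*> x, int n) {
  double result = 0.0;
  // ParallelReduce instantiates with ReducerType = Max<double> instead of InvalidType.
  Kokkos::Experimental::Max<double> reducer(result);
  Kokkos::parallel_reduce( Kokkos::RangePolicy<>(0, n),
    KOKKOS_LAMBDA(const int i, double& update) {
      if (x(i) > update) update = x(i);
    }, reducer );
  return result;
}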

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -56,11 +56,14 @@
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
@ -108,7 +111,7 @@ public:
//! This execution space's preferred array layout.
typedef LayoutLeft array_layout ;
//!
//!
typedef ScratchMemorySpace< Cuda > scratch_memory_space ;
//@}
@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <KokkosExp_View.hpp>
#include <Cuda/KokkosExp_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
//----------------------------------------------------------------------------

View File

@ -54,10 +54,7 @@
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
/*--------------------------------------------------------------------------*/
@ -77,33 +74,6 @@ public:
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
CudaSpace();
CudaSpace( CudaSpace && rhs ) = default ;
CudaSpace( const CudaSpace & rhs ) = default ;
@ -137,7 +107,7 @@ namespace Impl {
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_cuda_space();
void init_lock_arrays_cuda_space();
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
@ -146,7 +116,23 @@ void init_lock_array_cuda_space();
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer.
/// If the array has not yet been allocated, this call allocates it first.
int* lock_array_cuda_space_ptr(bool deallocate = false);
int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
///
/// Team and Thread private scratch allocations in
/// global memory are acquired via locks.
/// This function retrieves the lock array pointer.
/// If the array has not yet been allocated, this call allocates it first.
int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for unique identifiers.
///
/// Unique identifiers in the range 0-Cuda::concurrency
/// are provided via locks.
/// This function retrieves the lock array pointer.
/// If the array has not yet been allocated, this call allocates it first.
int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
}
} // namespace Kokkos
@ -172,33 +158,6 @@ public:
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
CudaUVMSpace();
CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
@ -242,22 +201,6 @@ public:
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaHostAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
CudaHostPinnedSpace();
CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,167 +47,15 @@
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_AnalyzePolicy.hpp>
#include <Kokkos_Concepts.hpp>
#include <iostream>
//----------------------------------------------------------------------------
namespace Kokkos {
//Schedules for Execution Policies
struct Static {
};
struct Dynamic {
};
//Schedule Wrapper Type
template<class ScheduleType>
struct Schedule {
static_assert(std::is_same<ScheduleType,Static>::value ||
std::is_same<ScheduleType,Dynamic>::value,
"Kokkos: Invalid Schedule<> type.");
typedef Schedule<ScheduleType> schedule_type;
typedef ScheduleType type;
};
//Specify Iteration Index Type
template<typename iType>
struct IndexType {
static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>.");
typedef IndexType<iType> index_type;
typedef iType type;
};
namespace Impl {
template<class Arg>
struct is_schedule_type {
enum { value = 0};
};
template<class ScheduleType>
struct is_schedule_type<Schedule<ScheduleType> > {
enum {value = 1 };
};
template<class Arg>
struct is_index_type {
enum { value = 0 };
};
template<typename iType>
struct is_index_type<IndexType<iType> > {
enum { value = 1 };
};
template<typename Arg>
struct is_tag_type {
enum { value = !(is_execution_space<Arg>::value ||
is_schedule_type<Arg>::value ||
is_index_type<Arg>::value ||
std::is_integral<Arg>::value)};
};
//Policy Traits
template<class ... Properties>
struct PolicyTraits;
template<>
struct PolicyTraits<void> {
typedef void execution_space;
typedef void schedule_type;
typedef void index_type;
typedef void tag_type;
};
//Strip off ExecutionSpace
template<class ExecutionSpace, class ... Props>
struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value,
"ExecutionPolicy: Only one execution space template argument may be used.");
typedef ExecutionSpace execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off ScheduleType
template<class ScheduleType, class ... Props>
struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value,
"ExecutionPolicy: Only one Schedule<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef ScheduleType schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off IndexType
template<typename iType, class ... Props>
struct PolicyTraits<void, IndexType<iType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off raw IndexType
template<typename iType, class ... Props>
struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off TagType
template<class TagType, class ... Props>
struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value &&
!is_execution_space<TagType>::value &&
!is_index_type<TagType>::value &&
!std::is_integral<TagType>::value
>::type,
TagType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value,
"ExecutionPolicy: Only one tag type template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef TagType tag_type;
};
template<class ... Props>
struct PolicyTraits {
#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value,
Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value,
Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value,
typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value,
void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag;
#else
typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space;
typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type;
typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag;
typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type;
typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type;
#endif
};
}
}
namespace Kokkos {
/** \brief Execution policy for work over a range of an integral type.
*
* Valid template argument options:
@ -230,7 +78,9 @@ namespace Kokkos {
* Blocking is the granularity of partitioning the range among threads.
*/
template<class ... Properties>
class RangePolicy: public Impl::PolicyTraits<Properties ... > {
class RangePolicy
: public Impl::PolicyTraits<Properties ... >
{
private:
typedef Impl::PolicyTraits<Properties ... > traits;
@ -243,6 +93,7 @@ private:
public:
//! Tag this class as an execution policy
typedef RangePolicy execution_policy;
typedef typename traits::index_type member_type ;
KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
@ -348,7 +199,7 @@ public:
: m_begin(0), m_end(0)
{
if ( part_size ) {
// Split evenly among partitions, then round up to the granularity.
const member_type work_part =
( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
@ -356,7 +207,7 @@ public:
m_begin = range.begin() + work_part * part_rank ;
m_end = m_begin + work_part ;
if ( range.end() < m_begin ) m_begin = range.end() ;
if ( range.end() < m_end ) m_end = range.end() ;
}
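Worked example: splitting a 100-element range into part_size = 8 partitions gives work_part = (100 + 7) / 8 = 13 (before rounding up to the chunk granularity), so rank 7 covers [91, 104) until its end is clamped to range.end() = 100.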
@ -366,10 +217,11 @@ public:
member_type m_end ;
WorkRange();
WorkRange & operator = ( const WorkRange & );
};
};
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -377,38 +229,6 @@ public:
namespace Kokkos {
namespace Experimental {
/** \brief Scratch memory request accepting per team and per thread value
*
* An instance of this class can be given as the last argument to a
* TeamPolicy constructor. It sets the amount of user requested shared
* memory for the team.
*/
template< class MemorySpace >
class TeamScratchRequest {
size_t m_per_team;
size_t m_per_thread;
public:
TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
m_per_team(per_team_), m_per_thread(per_thread_) {
}
size_t per_team() const {
return m_per_team;
}
size_t per_thread() const {
return m_per_thread;
}
size_t total(const size_t team_size) const {
return m_per_team + m_per_thread * team_size;
}
};
}
namespace Impl {
@ -451,11 +271,9 @@ public:
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
template<class MemorySpace>
TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
/* TeamPolicyInternal( int league_size_request , int team_size_request );
template<class MemorySpace>
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
/** \brief The actual league size (number of teams) of the policy.
*
@ -574,12 +392,14 @@ class TeamPolicy: public
typedef Impl::TeamPolicyInternal<
typename Impl::PolicyTraits<Properties ... >::execution_space,
Properties ...> internal_policy;
typedef Impl::PolicyTraits<Properties ... > traits;
public:
typedef TeamPolicy execution_policy;
TeamPolicy& operator = (const TeamPolicy&) = default;
/** \brief Construct policy with the given instance of the execution space */
TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
: internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {}
@ -594,13 +414,11 @@ public:
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
: internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
template<class MemorySpace>
TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
: internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {}
/* TeamPolicy( int league_size_request , int team_size_request )
: internal_policy(league_size_request,team_size_request) {}
template<class MemorySpace>
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
: internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {}
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & )
: internal_policy(league_size_request,Kokkos::AUTO()) {}*/
private:
TeamPolicy(const internal_policy& p):internal_policy(p) {}
@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
//----------------------------------------------------------------------------

View File

@ -120,21 +120,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::HBWMallocAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
/* Functions unique to the HBWSpace */
static int in_parallel();

View File

@ -55,9 +55,6 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/KokkosExp_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/
@ -128,25 +125,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
#else
typedef Impl::AlignedAllocator allocator ;
#endif
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();

View File

@ -133,11 +133,23 @@
// still identifies as 7.0
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
#endif
#if ( CUDA_VERSION < 8000 )
#define KOKKOS_LAMBDA [=]__device__
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#endif
#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
// Cuda version 8.0 still needs the functor wrapper
#if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
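A typical dispatch site for the macro (illustrative; with CUDA >= 8.0 the lambda is compiled for both host and device):

#include <Kokkos_Core.hpp>

void axpy(double alpha, Kokkos::View<double*> x, Kokkos::View<double*> y, int n) {
  // KOKKOS_LAMBDA expands to [=]__host__ __device__ under CUDA 8, [=]__device__ before.
  Kokkos::parallel_for( n, KOKKOS_LAMBDA(const int i) { y(i) += alpha * x(i); } );
}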
/*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */
@ -440,27 +452,16 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* Transitional macro to change between old and new View,
* default to use new View.
/* Transitional macros to change between old and new View
 * are no longer supported.
*/
#if ! defined( KOKKOS_USING_EXP_VIEW )
#if defined( KOKKOS_USING_DEPRECATED_VIEW )
#define KOKKOS_USING_EXP_VIEW 0
#else
#define KOKKOS_USING_EXP_VIEW 1
#endif
#error "Kokkos deprecated View has been removed"
#endif
#if KOKKOS_USING_EXP_VIEW
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#define KOKKOS_USING_EXP_VIEW 1
#define KOKKOS_USING_EXPERIMENTAL_VIEW
#endif
#else /* ! KOKKOS_USING_EXP_VIEW */
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible"
#endif
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

File diff suppressed because it is too large

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -58,9 +58,11 @@
#endif
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
/*--------------------------------------------------------------------------*/

View File

@ -1,12 +1,12 @@
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -35,7 +35,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
@ -125,17 +125,26 @@ struct pair
return *this;
}
/// \brief Assignment operator.
/// \brief Assignment operator, for volatile <tt>*this</tt>.
///
/// This calls the assignment operators of T1 and T2. It won't
/// \param p [in] Input; right-hand side of the assignment.
///
/// This calls the assignment operators of T1 and T2. It will not
/// compile if the assignment operators are not defined and public.
///
/// This operator returns \c void instead of <tt>volatile pair<T1,
/// T2>& </tt>. See Kokkos Issue #177 for the explanation. In
/// practice, this means that you should not chain assignments with
/// volatile lvalues.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile
void operator=(const volatile pair<U,V> &p) volatile
{
first = p.first;
second = p.second;
return *this;
// We deliberately do not return anything here. See explanation
// in public documentation above.
}
// from std::pair<U,V>

View File

@ -57,7 +57,6 @@
#include <typeinfo>
#endif
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy & policy
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
#endif
}
@ -210,8 +209,8 @@ void parallel_for( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -222,8 +221,8 @@ void parallel_for( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
#endif
}
@ -248,405 +247,9 @@ void parallel_for( const std::string & str
(void) str;
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... execution_space ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... execution_space ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
// typedef typename
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
// execution_space ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
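A minimal use of the integral-range overloads, reducing into a host scalar (illustrative):

#include <Kokkos_Core.hpp>

double dot(Kokkos::View<double*> x, Kokkos::View<double*> y, int n) {
  double result = 0.0;
  // The work_count overload builds RangePolicy<DefaultExecutionSpace>(0, n) internally.
  Kokkos::parallel_reduce( n, KOKKOS_LAMBDA(const int i, double& update) {
    update += x(i) * y(i);
  }, result );
  return result;
}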
// general policy and view output
template< class ExecPolicy , class FunctorType , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and pod or array of pod output
template< class ExecPolicy , class FunctorType >
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
#ifdef KOKKOS_HAVE_CUDA
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value &&
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
, const std::string& str = ""
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
)
#else
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value)
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
>::type result_ref
, const std::string& str = ""
)
#endif
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and view output
template< class FunctorType , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and pod or array of pod output
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits<
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
Impl::is_integral<FunctorType>::value,
void,FunctorType>::type
, void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< true
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
#ifndef KOKKOS_HAVE_CUDA
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
#endif
} // namespace Kokkos
#include <Kokkos_Parallel_Reduce.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif
@ -849,8 +452,8 @@ void parallel_scan( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -861,8 +464,8 @@ void parallel_scan( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif

File diff suppressed because it is too large

View File

@ -66,11 +66,15 @@ public:
private:
mutable char * m_iter ;
char * m_end ;
mutable char * m_iter_L0 ;
char * m_end_L0 ;
mutable char * m_iter_L1 ;
char * m_end_L1 ;
mutable int m_multiplier;
mutable int m_offset;
mutable int m_default_level;
ScratchMemorySpace();
ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
@ -95,34 +99,58 @@ public:
template< typename IntType >
KOKKOS_INLINE_FUNCTION
void* get_shmem (const IntType& size) const {
void* tmp = m_iter + m_offset * align (size);
if (m_end < (m_iter += align (size) * m_multiplier)) {
m_iter -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end-m_iter));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
void* get_shmem (const IntType& size, int level = -1) const {
if(level == -1)
level = m_default_level;
if(level == 0) {
void* tmp = m_iter_L0 + m_offset * align (size);
if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L0-m_iter_L0));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
}
return tmp;
} else {
void* tmp = m_iter_L1 + m_offset * align (size);
if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L1-m_iter_L1));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
}
return tmp;
}
return tmp;
}
template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr , const IntType & size )
: m_iter( (char *) ptr )
, m_end( m_iter + size )
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
: m_iter_L0( (char *) ptr_L0 )
, m_end_L0( m_iter_L0 + size_L0 )
, m_iter_L1( (char *) ptr_L1 )
, m_end_L1( m_iter_L1 + size_L1 )
, m_multiplier( 1 )
, m_offset( 0 )
, m_default_level( 0 )
{}
KOKKOS_INLINE_FUNCTION
const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const {
const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
m_default_level = level;
m_multiplier = multiplier;
m_offset = offset;
return *this;
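Inside a team kernel the new level argument is reached through team_shmem(); a hedged sketch (the sizes and raw-pointer use are illustrative):

#include <Kokkos_Core.hpp>

struct TwoLevelScratch {
  typedef Kokkos::TeamPolicy<>::member_type member_type;
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& team) const {
    // Level 0: fast on-chip scratch; level 1: the global-memory scratch pool.
    double* fast = static_cast<double*>( team.team_shmem().get_shmem(64 * sizeof(double), 0) );
    double* big  = static_cast<double*>( team.team_shmem().get_shmem(4096 * sizeof(double), 1) );
    if (fast == 0 || big == 0) return; // get_shmem returns NULL when a level is exhausted
  }
};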

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -50,12 +50,17 @@
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
@ -142,7 +147,9 @@ public:
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
static int is_initialized() { return 1 ; }
@ -151,7 +158,11 @@ public:
static int concurrency() { return 1; }
//! Free any resources being consumed by the device.
static void finalize() {}
static void finalize() {
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<
{
private:
size_t m_team_scratch_size ;
size_t m_thread_scratch_size ;
size_t m_team_scratch_size[2] ;
size_t m_thread_scratch_size[2] ;
int m_league_size ;
int m_chunk_size;
@ -324,8 +335,10 @@ public:
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_scratch_size = p.m_team_scratch_size;
m_thread_scratch_size = p.m_thread_scratch_size;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
@ -348,15 +361,15 @@ public:
inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; }
inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
/** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space &
, int league_size_request
, int /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -365,8 +378,8 @@ public:
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -374,8 +387,8 @@ public:
TeamPolicyInternal( int league_size_request
, int /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -383,8 +396,8 @@ public:
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -401,26 +414,23 @@ public:
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_thread_scratch_size = per_thread.value;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
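Callers feed these setters through chained policy construction; a sketch assuming the Kokkos::PerTeam / Kokkos::PerThread helpers that build PerTeamValue / PerThreadValue:

#include <Kokkos_Core.hpp>

struct ScratchUser {
  typedef Kokkos::TeamPolicy<>::member_type member_type;
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& team) const {
    (void) team.team_shmem().get_shmem(512, 1); // draw from the level-1 pool
  }
};

void launch(int league_size) {
  Kokkos::TeamPolicy<> policy(league_size, 1);
  // 1 KiB per team at level 0; 512 B per team plus 256 B per thread at level 1.
  Kokkos::parallel_for(
    policy.set_scratch_size(0, Kokkos::PerTeam(1024))
          .set_scratch_size(1, Kokkos::PerTeam(512), Kokkos::PerThread(256)),
    ScratchUser() );
}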
@ -440,7 +450,7 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
class ParallelFor< FunctorType ,
Kokkos::RangePolicy< Traits ... > ,
Kokkos::Serial
>
@ -489,9 +499,10 @@ public:
/*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
@ -499,14 +510,19 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
@ -515,15 +531,15 @@ private:
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
@ -532,15 +548,15 @@ private:
exec( pointer_type ptr ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -549,25 +565,43 @@ public:
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , 0 );
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
}
template< class ViewType >
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Serial reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Serial reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result )
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
static_assert( std::is_same< typename ViewType::memory_space
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
@ -697,15 +731,16 @@ public:
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ }
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Serial
>
{
@ -714,30 +749,35 @@ private:
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
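// ReducerConditional forwards the reducer when one is supplied (ReducerType
// is not InvalidType) and otherwise falls back to the functor, so the value
// traits and init below are taken from whichever type defines the reduction.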
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const int m_league ;
const int m_shared ;
const ReducerType m_reducer ;
pointer_type m_result_ptr ;
const int m_shared ;
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
@ -747,14 +787,14 @@ private:
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -763,7 +803,7 @@ public:
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , m_shared );
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
}
@ -771,12 +811,16 @@ public:
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result
)
, const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@ -786,6 +830,21 @@ public:
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func
}
}
//----------------------------------------------------------------------------
#include <impl/Kokkos_Serial_Task.hpp>
#endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */

View File

@ -1,4 +1,3 @@
/*
//@HEADER
// ************************************************************************
@ -47,13 +46,655 @@
#ifndef KOKKOS_TASKPOLICY_HPP
#define KOKKOS_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
//----------------------------------------------------------------------------
#include <Kokkos_Core_fwd.hpp>
// If compiling with CUDA, then CUDA 8 or better is required and relocatable
// device code must be enabled for the task policy to be available.
// nvcc relocatable device code option: --relocatable-device-code=true
#if ( defined( KOKKOS_COMPILER_NVCC ) )
#if ( 8000 <= CUDA_VERSION ) && \
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_TASKPOLICY
#endif
#else
#define KOKKOS_ENABLE_TASKPOLICY
#endif
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_TaskQueue.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
enum TaskPriority { TaskHighPriority = 0
, TaskRegularPriority = 1
, TaskLowPriority = 2 };
template< typename Space >
class TaskPolicy ;
template< typename Space >
void wait( TaskPolicy< Space > const & );
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/**\brief Implementation data for task data management, access, and execution.
*
* CRTP Inheritance structure to allow static_cast from the
* task root type and a task's FunctorType.
*
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
* TaskBase< Space , ResultType , void >
* : TaskBase< Space , void , void >
* { ... };
*/
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
}} // namespace Kokkos::Impl
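// Editorial aside, not part of this diff: a minimal self-contained sketch of
// the CRTP recovery described above, where a type-erased task root regains
// its stored functor via static_cast. TaskRoot, Task, and HelloFunctor are
// hypothetical stand-ins for TaskBase<Space,void,void> and
// TaskBase<Space,Result,Functor>.
//
//   #include <cstdio>
//
//   struct TaskRoot { void (*m_apply)( TaskRoot * ); };
//
//   template< typename Functor >
//   struct Task : public TaskRoot , public Functor {
//     explicit Task( Functor const & f )
//       : TaskRoot{ & Task::apply } , Functor( f ) {}
//     static void apply( TaskRoot * root )
//       {
//         // Downcast from the erased root, then invoke the functor sub-object:
//         Task * const task = static_cast< Task * >( root );
//         static_cast< Functor & >( *task )();
//       }
//   };
//
//   struct HelloFunctor { void operator()() const { std::printf("task\n"); } };
//
//   int main() {
//     Task< HelloFunctor > t{ HelloFunctor() };
//     t.m_apply( & t );  // a queue would call tasks through this stored pointer
//   }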
//----------------------------------------------------------------------------
namespace Kokkos {
/**
*
* Future< space > // value_type == void
* Future< value > // space == Default
* Future< value , space >
*
*/
template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
class Future {
private:
template< typename > friend class TaskPolicy ;
template< typename , typename > friend class Future ;
template< typename , typename , typename > friend class Impl::TaskBase ;
enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value };
enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value };
enum { Arg1_is_value = ! Arg1_is_space &&
! std::is_same< Arg1 , void >::value };
enum { Arg2_is_value = ! Arg2_is_space &&
! std::is_same< Arg2 , void >::value };
static_assert( ! ( Arg1_is_space && Arg2_is_space )
, "Future cannot be given two spaces" );
static_assert( ! ( Arg1_is_value && Arg2_is_value )
, "Future cannot be given two value types" );
using ValueType =
typename std::conditional< Arg1_is_value , Arg1 ,
typename std::conditional< Arg2_is_value , Arg2 , void
>::type >::type ;
using Space =
typename std::conditional< Arg1_is_space , Arg1 ,
typename std::conditional< Arg2_is_space , Arg2 , void
>::type >::type ;
using task_base = Impl::TaskBase< Space , ValueType , void > ;
using queue_type = Impl::TaskQueue< Space > ;
task_base * m_task ;
KOKKOS_INLINE_FUNCTION explicit
Future( task_base * task ) : m_task(0)
{ if ( task ) queue_type::assign( & m_task , task ); }
//----------------------------------------
public:
using execution_space = typename Space::execution_space ;
using value_type = ValueType ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
bool is_null() const { return 0 == m_task ; }
KOKKOS_INLINE_FUNCTION
int reference_count() const
{ return 0 != m_task ? m_task->reference_count() : 0 ; }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr Future() noexcept : m_task(0) {}
KOKKOS_INLINE_FUNCTION
Future( Future && rhs )
: m_task( rhs.m_task ) { rhs.m_task = 0 ; }
KOKKOS_INLINE_FUNCTION
Future( const Future & rhs )
: m_task(0)
{ if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future && rhs )
{
if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future & rhs )
{
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
//----------------------------------------
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( Future<A1,A2> && rhs )
: m_task( rhs.m_task )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
rhs.m_task = 0 ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( const Future<A1,A2> & rhs )
: m_task(0)
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future<A1,A2> & rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future<A1,A2> && rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
typename task_base::get_return_type
get() const
{
if ( 0 == m_task ) {
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
}
return m_task->get();
}
};
} // namespace Kokkos
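// Editorial usage note for the argument resolution above (assumes a build
// where the Serial device is enabled; int is an illustrative value type):
//
//   Kokkos::Future< Kokkos::Serial >         // value_type == void, Serial space
//   Kokkos::Future< int >                    // int value, default space
//   Kokkos::Future< int , Kokkos::Serial >   // int value, Serial space
//
//   static_assert( std::is_same< Kokkos::Future< int , Kokkos::Serial >
//                                  ::value_type , int >::value
//                , "value type deduced from the value argument" );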
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ExecSpace >
class TaskPolicy
{
private:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
track_type m_track ;
queue_type * m_queue ;
//----------------------------------------
// Process optional arguments to spawn and respawn functions
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const ) {}
// TaskTeam or TaskSingle
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskType const & arg
, Options const & ... opts )
{
task->m_task_type = arg ;
assign( task , opts ... );
}
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskPriority const & arg
, Options const & ... opts )
{
task->m_priority = arg ;
assign( task , opts ... );
}
// Future for a dependence
template< typename A1 , typename A2 , typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, Future< A1 , A2 > const & arg
, Options const & ... opts )
{
// Assign dependence to task->m_next
// which will be processed within a subsequent call to schedule.
// Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
}
if ( 0 != arg.m_task ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
}
assign( task , opts ... );
}
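// Note: the overload set above peels one option per call. For example,
// assign( task , TaskHighPriority , future ) dispatches on the first
// argument's type, records it on the task, then recurses on the remaining
// pack until the nullary overload terminates the unpacking.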
//----------------------------------------
public:
using execution_policy = TaskPolicy ;
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
KOKKOS_INLINE_FUNCTION
TaskPolicy() : m_track(), m_queue(0) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy const & rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
TaskPolicy( memory_space const & arg_memory_space
, unsigned const arg_memory_pool_capacity
, unsigned const arg_memory_pool_log2_superblock = 12 )
: m_track()
, m_queue(0)
{
typedef Kokkos::Experimental::Impl::SharedAllocationRecord
< memory_space , typename queue_type::Destroy >
record_type ;
record_type * record =
record_type::allocate( arg_memory_space
, "TaskQueue"
, sizeof(queue_type)
);
m_queue = new( record->data() )
queue_type( arg_memory_space
, arg_memory_pool_capacity
, arg_memory_pool_log2_superblock );
record->m_destroy.m_queue = m_queue ;
m_track.assign_allocated_record_to_uninitialized( record );
}
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using task_type = Impl::TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
return m_queue->allocate_block_size( sizeof(task_type) );
}
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
}
//----------------------------------------
/**\brief A task spawns a task with options
*
* 1) High, Regular, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
KOKKOS_FUNCTION
Future< typename FunctorType::value_type , ExecSpace >
task_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
//----------------------------------------
// Give single-thread back-ends an opportunity to clear
// the queue of ready tasks before allocating a new task
m_queue->iff_single_thread_recursive_execute();
//----------------------------------------
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
if ( f.m_task ) {
// Placement new construction
new ( f.m_task ) task_type( arg_functor );
// Reference count starts at two
// +1 for matching decrement when task is complete
// +1 for future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Spawning from within the execution space so the
// apply function pointer is guaranteed to be valid
f.m_task->m_apply = task_type::apply ;
m_queue->schedule( f.m_task );
// this task may be updated or executed at any moment
}
return f ;
}
/**\brief The host process spawns a task with options
*
* 1) High, Regular, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
inline
Future< typename FunctorType::value_type , ExecSpace >
host_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
if ( f.m_task ) {
// Placement new construction
new( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Potentially spawning outside the execution space, so the
// apply function pointer must be obtained from the execution space.
// Required for the Cuda execution space's function pointer.
queue_type::specialization::template
proc_set_apply< FunctorType >( & f.m_task->m_apply );
m_queue->schedule( f.m_task );
}
return f ;
}
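// Editorial usage sketch for host_spawn; the functor, pool size, and memory
// space are illustrative assumptions, not part of this diff:
//
//   struct HelloTask {
//     using value_type = void ;
//     template< typename Member >
//     KOKKOS_INLINE_FUNCTION void operator()( Member & ) const { /* body */ }
//   };
//
//   TaskPolicy< Kokkos::Serial > policy( Kokkos::HostSpace() , 1 << 20 );
//   auto f = policy.host_spawn( HelloTask()
//                             , Kokkos::TaskSingle
//                             , Kokkos::TaskHighPriority );
//   Kokkos::wait( policy );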
/**\brief Return a future that is complete
* when all input futures are complete.
*/
template< typename A1 , typename A2 >
KOKKOS_FUNCTION
Future< ExecSpace >
when_all( int narg , Future< A1 , A2 > const * const arg ) const
{
static_assert
( std::is_same< execution_space
, typename Future< A1 , A2 >::execution_space
>::value
, "Future must have same execution space" );
using future_type = Future< ExecSpace > ;
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
future_type f ;
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
if ( f.m_task ) {
new( f.m_task ) task_base();
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
task_base ** const dep = f.m_task->aggregate_dependences();
// Assign dependences to increment their reference count
// The futures may be destroyed upon returning from this call
// so increment reference count to track this assignment.
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = dep[i] = arg[i].m_task ;
if ( 0 != t ) {
Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
}
}
m_queue->schedule( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}
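// Editorial usage sketch for when_all (fa and fb are futures spawned
// earlier; all names are illustrative):
//
//   Kokkos::Future< Kokkos::Serial > deps[2] = { fa , fb };
//   auto all = policy.when_all( 2 , deps );  // completes after both inputs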
/**\brief An executing task respawns itself with options
*
* 1) High, Regular, or Low priority
* 2) With or without dependence
*/
template< class FunctorType , typename ... Options >
KOKKOS_FUNCTION
void respawn( FunctorType * task_self
, Options const & ... arg_options ) const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_base * const zero = (task_base *) 0 ;
task_base * const lock = (task_base *) task_base::LockTag ;
task_type * const task = static_cast< task_type * >( task_self );
// Precondition:
// task is in Executing state
// therefore m_next == LockTag
//
// Change to m_next == 0 for no dependence
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
}
assign( task , arg_options... );
// Postcondition:
// task is in Executing-Respawn state
// therefore m_next == dependence or 0
}
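// Editorial usage sketch for respawn, from inside a task body; m_phase,
// m_dep, and m_policy are assumed members of the illustrative functor:
//
//   template< typename Member >
//   KOKKOS_INLINE_FUNCTION void operator()( Member & ) {
//     if ( 0 == m_phase ) {                  // first run: defer behind m_dep
//       m_phase = 1 ;
//       m_policy.respawn( this , m_dep );    // re-enqueued once m_dep completes
//       return ;
//     }
//     /* second run: m_dep has completed */
//   }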
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskPolicy< S > const & );
//----------------------------------------
inline
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const noexcept
{ return m_queue->m_count_alloc ; }
KOKKOS_INLINE_FUNCTION
int allocated_task_count_max() const noexcept
{ return m_queue->m_max_alloc ; }
KOKKOS_INLINE_FUNCTION
long allocated_task_count_accum() const noexcept
{ return m_queue->m_accum_alloc ; }
};
template< typename ExecSpace >
inline
void wait( TaskPolicy< ExecSpace > const & policy )
{ policy.m_queue->execute(); }
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & );
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_TASKPOLICY_HPP */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Threads/Kokkos_ThreadsTeam.hpp>
#include <Threads/Kokkos_Threads_Parallel.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

File diff suppressed because it is too large

View File

@ -178,9 +178,10 @@ public:
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
@ -192,15 +193,21 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// TODO: static_assert that WorkTag is void when ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
@ -252,7 +259,7 @@ public:
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
#pragma omp parallel
{
@ -260,7 +267,7 @@ public:
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
@ -269,13 +276,13 @@ public:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
@ -289,7 +296,7 @@ public:
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
#pragma omp parallel
{
@ -302,7 +309,7 @@ public:
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
@ -319,13 +326,13 @@ public:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
@ -337,18 +344,35 @@ public:
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ViewType & arg_result_view )
, const ViewType & arg_result_view
, typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
static_assert( std::is_same< typename ViewType::memory_space
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
@ -568,13 +592,13 @@ public:
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
#pragma omp parallel
{
ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
( m_functor
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
}
/* END #pragma omp parallel */
}
@ -584,14 +608,15 @@ public:
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
};
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::OpenMP
>
{
@ -602,15 +627,19 @@ private:
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shmem_size ;
@ -644,7 +673,7 @@ public:
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
#pragma omp parallel
{
@ -652,8 +681,8 @@ public:
ParallelReduce::template exec_team< WorkTag >
( m_functor
, Member( exec , m_policy , m_shmem_size )
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
, Member( exec , m_policy , m_shmem_size, 0 )
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
@ -665,13 +694,13 @@ public:
max_active_threads = m_policy.league_size()* m_policy.team_size();
for ( int i = 1 ; i < max_active_threads ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
@ -682,12 +711,33 @@ public:
inline
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result )
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl

View File

@ -0,0 +1,329 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
TaskExec< Kokkos::OpenMP >::
TaskExec()
: m_self_exec( 0 )
, m_team_exec( 0 )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( 0 )
, m_team_rank( 0 )
, m_team_size( 1 )
{
}
TaskExec< Kokkos::OpenMP >::
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans threads
//   m_self_exec->pool_rev( team_size * group_rank )
//   through
//   m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
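// Worked example of the encoding above: for m_team_size == 2,
// m_sync_value == 0x0101 and m_sync_mask == 0x0303, i.e. one byte lane per
// member. Each barrier writes the low two bits of its member's lane, and
// XOR-ing the expected value with the mask every other step alternates the
// pattern between 0x0101 and 0x0202 so consecutive barriers cannot be
// confused with one another.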
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
#endif
//----------------------------------------------------------------------------
void TaskQueueSpecialization< Kokkos::OpenMP >::execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
using Member = TaskExec< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8 because the team barrier packs one sync byte
// per member into a single int64_t word
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Member single_exec ;
Member team_exec( self , team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire OpenMP thread pool to ensure initialization
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
}
// Team lead broadcasts the acquired task to team members:
if ( 1 < team_exec.team_size() ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
// All team members wait for whole team to reach this statement.
// This is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task, but necessary to prevent
// task_shared from being updated before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
void TaskQueueSpecialization< Kokkos::OpenMP >::
iff_single_thread_recursive_execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
task_root_type * task = end ;
do {
task = end ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
if ( end == task ) break ;
(*task->m_apply)( task , & single_exec );
queue->complete( task );
} while(1);
}
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,356 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::OpenMP >
{
public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::OpenMP
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
};
extern template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::OpenMP >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl:: TaskExec< Kokkos::OpenMP > & thread
, const iType & start
, const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >(thread,start,end);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
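// Editorial usage sketch from inside a task body; member is the TaskExec
// reference handed to the task, and x and n are illustrative:
//
//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( int i ) { x[i] = 2 * x[i] ; } );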
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
shared[0] += shared[i];
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
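// Editorial usage sketch (team-wide sum; member, x, and n are illustrative):
//
//   double sum = 0 ;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( int i , double & upd ) { upd += x[i] ; } , sum );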
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
join(shared[0], shared[i]);
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 performs the scan on the accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
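// Editorial usage sketch (exclusive prefix sum). ValueType is not deducible
// from a generic lambda, so it is supplied explicitly; member, x, y, and n
// are illustrative:
//
//   Kokkos::parallel_scan< int >( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( int i , int & partial , bool final )
//       { if ( final ) y[i] = partial ; partial += x[i] ; } );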
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */

View File

@ -49,6 +49,7 @@
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#ifdef KOKKOS_HAVE_OPENMP
@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
#if ! KOKKOS_USING_EXP_VIEW
OpenMPexec::Pool OpenMPexec::m_pool;
#else
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
#endif
void OpenMPexec::verify_is_process( const char * const label )
{
if ( omp_in_parallel() ) {
@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch()
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_pool[ rank_rev ] ) {
Record * const r = Record::get_record( m_pool[ rank_rev ] );
m_pool[ rank_rev ] = 0 ;
Record::decrement( r );
}
#else
m_pool.at(rank_rev).clear();
#endif
}
/* END #pragma omp parallel */
}
@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
#else
#pragma omp critical
{
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
}
#endif
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
}
/* END #pragma omp parallel */
@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count ,
}
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
//----------------------------------------------------------------------------
@ -350,6 +332,10 @@ void OpenMP::finalize()
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
hwloc::unbind_this_thread();
}
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
//----------------------------------------------------------------------------

View File

@ -46,7 +46,6 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
@ -63,38 +62,10 @@ public:
enum { MAX_THREAD_COUNT = 4096 };
#if ! KOKKOS_USING_EXP_VIEW
struct Pool
{
Pool() : m_trackers() {}
AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
OpenMPexec * operator[](int i)
{
return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
}
AllocationTracker & at(int i)
{
return m_trackers[i];
}
};
private:
static Pool m_pool; // Indexed by: m_pool_rank_rev
#else
private:
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
#endif
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
@ -145,6 +116,12 @@ public:
inline long team_work_index() const { return m_team_work_index ; }
inline int scratch_reduce_size() const
{ return m_scratch_reduce_end - m_scratch_exec_end ; }
inline int scratch_thread_size() const
{ return m_scratch_thread_end - m_scratch_reduce_end ; }
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
@ -157,15 +134,15 @@ public:
~OpenMPexec() {}
OpenMPexec( const int poolRank
, const int scratch_exec_size
, const int scratch_reduce_size
, const int scratch_thread_size )
: m_pool_rank( poolRank )
, m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
, m_scratch_exec_end( scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
OpenMPexec( const int arg_poolRank
, const int arg_scratch_exec_size
, const int arg_scratch_reduce_size
, const int arg_scratch_thread_size )
: m_pool_rank( arg_poolRank )
, m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
, m_scratch_exec_end( arg_scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
, m_barrier_state(0)
{}
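
The renamed arg_ parameters make the scratch layout easier to read: the three sizes are stored as cumulative end offsets into a single allocation, so scratch_reduce() and scratch_thread() above are plain pointer bumps from the object base. A worked example of the offset arithmetic, with illustrative sizes:

    // exec = 512, reduce = 1024, thread = 2048 bytes
    const int scratch_exec_end   = 512;                        // = 512
    const int scratch_reduce_end = scratch_exec_end + 1024;    // = 1536
    const int scratch_thread_end = scratch_reduce_end + 2048;  // = 3584
    // scratch_reduce() == (char *) this + 512
    // scratch_thread() == (char *) this + 1536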
@ -330,7 +307,7 @@ public:
Impl::OpenMPexec & m_exec ;
scratch_memory_space m_team_shared ;
int m_team_shmem ;
int m_team_scratch_size[2] ;
int m_team_base_rev ;
int m_team_rank_rev ;
int m_team_rank ;
@ -378,15 +355,15 @@ public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_shmem() const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@ -568,11 +545,12 @@ public:
inline
OpenMPexecTeamMember( Impl::OpenMPexec & exec
, const TeamPolicyInternal< OpenMP, Properties ...> & team
, const int shmem_size
, const int shmem_size_L1
, const int shmem_size_L2
)
: m_exec( exec )
, m_team_shared(0,0)
, m_team_shmem( shmem_size )
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
, m_team_base_rev(0)
, m_team_rank_rev(0)
, m_team_rank(0)
@ -580,7 +558,7 @@ public:
, m_league_rank(0)
, m_league_end(0)
, m_league_size( team.league_size() )
, m_chunk_size( team.chunk_size() )
, m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
, m_league_chunk_end(0)
, m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
, m_team_alloc( team.team_alloc())
@ -589,10 +567,9 @@ public:
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc();
const int chunk_size = team.chunk_size()>0?team.chunk_size():team.team_iter();
const int chunks_per_team = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
int league_iter_begin = league_iter_end - chunks_per_team * chunk_size;
const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size;
if (league_iter_begin < 0) league_iter_begin = 0;
if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
@ -611,7 +588,9 @@ public:
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
m_league_end = league_iter_end ;
m_league_rank = league_iter_begin ;
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0 );
}
if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
@ -627,10 +606,13 @@ public:
void next_static()
{
if ( ++m_league_rank < m_league_end ) {
if ( m_league_rank < m_league_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
bool valid_dynamic() {
@ -661,10 +643,13 @@ public:
if(m_invalid_thread)
return;
team_barrier();
if ( ++m_league_rank < m_league_chunk_end ) {
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
if ( m_league_rank < m_league_chunk_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
@ -687,8 +672,10 @@ public:
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size = p.m_team_scratch_size;
m_thread_scratch_size = p.m_thread_scratch_size;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
@ -719,8 +706,8 @@ private:
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size;
size_t m_thread_scratch_size;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
@ -753,15 +740,19 @@ public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
if(team_size_ < 0)
team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
@ -769,24 +760,24 @@ public:
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
TeamPolicyInternal( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
@ -803,24 +794,21 @@ public:
}
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_thread_scratch_size = per_thread.value;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
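
Previously the level argument was accepted and silently discarded; the policy now keeps independent level-0 and level-1 sizes, so per-level requests land in the right slot. A usage sketch of the three-argument overload above, with illustrative byte counts:

    Kokkos::TeamPolicy<> policy( league_size, team_size );
    // Request 512 bytes per team and 64 bytes per thread at scratch level 0:
    auto p = policy.set_scratch_size( 0, Kokkos::PerTeam( 512 ), Kokkos::PerThread( 64 ) );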

View File

@ -104,7 +104,7 @@ namespace Kokkos {
int Qthread::is_initialized()
{
Impl::s_number_workers != 0 ;
return Impl::s_number_workers != 0 ;
}
int Qthread::concurrency()

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -113,7 +113,7 @@ public:
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -136,7 +136,7 @@ public:
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -145,11 +145,13 @@ public:
//----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
template< class FunctorType , class ReducerType , class ArgTag >
inline
void exec_all_reduce( const FunctorType & func ) const
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
@ -160,14 +162,14 @@ public:
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -197,7 +199,7 @@ public:
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_worker_base[0] is the
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
@ -216,7 +218,7 @@ public:
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -349,7 +351,7 @@ public:
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_shepherd_base[0] is the
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
@ -371,7 +373,7 @@ public:
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -130,9 +130,10 @@ public:
//----------------------------------------------------------------------------
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Qthread
>
{
@ -141,18 +142,24 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// Should static_assert that WorkTag is void when ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
inline static
@ -187,9 +194,10 @@ private:
ParallelReduce::template exec_range< WorkTag >(
self.m_functor, range.begin(), range.end(),
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor );
exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
}
public:
@ -197,26 +205,39 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result_view )
, const ViewType & arg_result_view
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{ }
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ }
};
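
The added constructor is the entry point for the new reducer interface: when the last argument of parallel_reduce is a reducer object rather than a result view, m_reducer carries it and m_result_ptr is taken from its result view. A sketch under the reducer spelling of this era (Kokkos::Experimental::Max); the view a and the bound n are illustrative:

    double max_val = 0 ;
    Kokkos::Experimental::Max<double> max_reducer( max_val );
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthread >( 0, n ),
      KOKKOS_LAMBDA( const int i, double & lmax ) {
        if ( a( i ) > lmax ) lmax = a( i ) ;  // thread-local maximum
      }, max_reducer );
    // max_val now holds the global maximum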
@ -291,10 +312,12 @@ public:
//----------------------------------------------------------------------------
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType
, TeamPolicy< Properties... >
, Kokkos::Qthread >
, ReducerType
, Kokkos::Qthread
>
{
private:
@ -303,14 +326,18 @@ private:
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
@ -345,9 +372,10 @@ private:
ParallelReduce::template exec_team< WorkTag >
( self.m_functor
, Member( exec , self.m_policy )
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
, ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
}
public:
@ -356,29 +384,43 @@ public:
void execute() const
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( m_functor )
( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result )
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
{ }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ }
};
//----------------------------------------------------------------------------
@ -395,8 +437,8 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;

View File

@ -58,6 +58,8 @@
#include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
@ -120,13 +122,13 @@ Task::~TaskMember()
}
Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( arg_verify )
@ -144,12 +146,12 @@ Task::TaskMember( const function_verify_type arg_verify
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( & Task::verify_type<void> )
@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg )
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
);
// It is a single thread's responsibility to close out
// this task's execution.
bool close_out = false ;
if ( task->m_apply_team && ! task->m_apply_single ) {
const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
// Initialize team size and rank with shephered info
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
@ -344,7 +342,7 @@ fflush(stdout);
if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier();
}
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) {
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
// Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
} // namespace Experimental
} // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */

View File

@ -69,6 +69,8 @@
#include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
@ -80,24 +82,24 @@ class TaskMember< Kokkos::Qthread , void , void >
{
public:
typedef void (* function_apply_single_type) ( TaskMember * );
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
private:
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_apply_single_type m_apply_single ; ///< Apply function
const function_apply_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_single_type m_apply_single ; ///< Apply function
const function_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
TaskMember() /* = delete */ ;
TaskMember( const TaskMember & ) /* = delete */ ;
@ -128,22 +130,22 @@ protected :
~TaskMember();
// Used by TaskMember< Qthread , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Qthread , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
public:
@ -221,7 +223,7 @@ public:
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 );
const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
@ -379,16 +381,16 @@ protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: task_root_type( & task_root_type::template verify_type< ResultType >
, arg_dealloc
@ -413,17 +415,17 @@ public:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, arg_apply_single
@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread >
public:
typedef Kokkos::Qthread execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
private:
@ -489,14 +492,17 @@ public:
, const unsigned arg_task_team_size = 0 /* choose default */
);
TaskPolicy() = default ;
TaskPolicy( TaskPolicy && rhs ) = default ;
TaskPolicy( const TaskPolicy & rhs ) = default ;
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy() = default ;
KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const { return m_active_count ; }
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f
@ -653,5 +659,6 @@ public:
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */

View File

@ -3,26 +3,23 @@
# Cloning repository and branch:
git clone https://github.com/stelleg/qthreads qthreads-with-clone
git clone git@github.com:Qthreads/qthreads.git qthreads
cd qthreads-with-clone
cd qthreads
# Added to ./git/config
#
# [branch "cloned_tasks"]
# remote = origin
# merge = refs/heads/cloned_tasks
#
# checkout branch with "cloned tasks"
git branch cloned_tasks
git checkout cloned_tasks
git pull
git checkout dev-kokkos
# Configure/autogen
sh autogen.sh
# configurure with 'hwloc' installation:
# configure with 'hwloc' installation:
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
# install
make install

View File

@ -53,6 +53,7 @@
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
//----------------------------------------------------------------------------
@ -134,11 +135,7 @@ void ThreadsExec::driver(void)
ThreadsExec::ThreadsExec()
: m_pool_base(0)
#if ! KOKKOS_USING_EXP_VIEW
, m_scratch()
#else
, m_scratch(0)
#endif
, m_scratch_reduce_end(0)
, m_scratch_thread_end(0)
, m_numa_rank(0)
@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec()
{
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_scratch ) {
@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec()
Record::decrement( r );
}
#else
m_scratch.clear();
#endif
m_pool_base = 0 ;
m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ;
@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch()
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( exec.m_scratch ) {
@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
Record::decrement( r );
}
#else
exec.m_scratch.clear();
#endif
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) {
#if KOKKOS_USING_EXP_VIEW
// Allocate tracked memory:
{
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
#else
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
#endif
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread
@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
}
#if KOKKOS_USING_EXP_VIEW
return s_threads_process.m_scratch ;
#else
return s_threads_process.m_scratch.alloc_ptr() ;
#endif
}
//----------------------------------------------------------------------------
@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
//----------------------------------------------------------------------------
@ -807,6 +776,10 @@ void ThreadsExec::finalize()
s_threads_process.m_pool_size = 1 ;
s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
//----------------------------------------------------------------------------

View File

@ -49,7 +49,6 @@
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
@ -89,11 +88,7 @@ private:
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker m_scratch ;
#else
void * m_scratch ;
#endif
int m_scratch_reduce_end ;
int m_scratch_thread_end ;
int m_numa_rank ;
@ -138,19 +133,10 @@ public:
static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank );
#if ! KOKKOS_USING_EXP_VIEW
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
#else
inline void * reduce_memory() const { return m_scratch ; }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
#endif
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }

View File

@ -129,15 +129,15 @@ public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@ -433,10 +433,11 @@ public:
void next_static()
{
if ( ++m_league_rank < m_league_end ) {
if ( m_league_rank < m_league_end ) {
team_barrier();
set_team_shared();
}
m_league_rank++;
}
bool valid_dynamic() {
@ -468,10 +469,11 @@ public:
if(m_invalid_thread)
return;
team_barrier();
if ( ++m_league_rank < m_league_chunk_end ) {
if ( m_league_rank < m_league_chunk_end ) {
team_barrier();
set_team_shared();
}
m_league_rank++;
}
void set_league_shmem( const int arg_league_rank
@ -504,8 +506,8 @@ private:
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size;
size_t m_thread_scratch_size;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
@ -549,8 +551,10 @@ public:
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size = p.m_team_scratch_size;
m_thread_scratch_size = p.m_thread_scratch_size;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
@ -577,7 +581,12 @@ public:
inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
if(team_size_ < 0)
team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
inline int team_iter() const { return m_team_iter ; }
/** \brief Specify league size, request team size */
@ -588,8 +597,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
@ -601,8 +610,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -612,8 +621,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,team_size_request); }
@ -623,8 +632,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -639,26 +648,23 @@ public:
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_thread_scratch_size = per_thread.value;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};

View File

@ -264,7 +264,7 @@ public:
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ }
};
@ -272,9 +272,10 @@ public:
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and RangePolicy */
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
@ -286,14 +287,18 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
@ -344,9 +349,9 @@ private:
ParallelReduce::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end()
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
template<class Schedule>
@ -362,7 +367,7 @@ private:
exec.barrier();
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() );
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
@ -372,7 +377,7 @@ private:
work_index = exec.get_work_index();
}
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
public:
@ -380,7 +385,7 @@ public:
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
ThreadsExec::start( & ParallelReduce::exec , this );
@ -391,7 +396,7 @@ public:
const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
@ -399,9 +404,14 @@ public:
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const HostViewType & arg_result_view )
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
@ -410,14 +420,30 @@ public:
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
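
The static_asserts in the view-result constructor encode the backend's contract: the destination must be a View, and it must live in HostSpace because the fan-in writes the final value from host threads. A conforming call, with an illustrative rank-0 result view and view a:

    Kokkos::View< double, Kokkos::HostSpace > sum_view( "sum" );
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Threads >( 0, n ),
      KOKKOS_LAMBDA( const int i, double & lsum ) { lsum += a( i ) ; },
      sum_view );
    const double sum = sum_view();  // safe: this parallel_reduce is synchronous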
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Threads
>
{
@ -426,14 +452,19 @@ private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shared ;
@ -464,9 +495,9 @@ private:
ParallelReduce::template exec_team< WorkTag >
( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
public:
@ -474,7 +505,7 @@ public:
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::exec , this );
@ -484,20 +515,41 @@ public:
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result )
inline
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ }
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
//----------------------------------------------------------------------------

View File

@ -46,9 +46,10 @@
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) ))
#define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue
, const unsigned arg_task_team_size
)
: m_space( Kokkos::Threads::memory_space()
, arg_task_max_size
, arg_task_max_size * arg_task_max_count
, 1 /* only one level of memory pool */
, arg_task_max_size * arg_task_max_count * 1.2
, 16 /* log2(superblock size) */
)
, m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 }
@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task
// The user created the task memory pool with a size estimate;
// if the estimate is too low then report and throw an exception.
if ( m_space.get_min_chunk_size() < size_alloc ) {
if ( m_space.get_min_block_size() < size_alloc ) {
fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
, int(size_alloc)
, int(m_space.get_min_chunk_size())
, int(m_space.get_min_block_size())
);
fflush(stderr);
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
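
The queue now sizes its pool with a flat 20% slack factor over the worst-case task footprint and a fixed 2^16-byte superblock, and the failure check compares against the pool's minimum block size rather than the old chunk size. Illustrative arithmetic for the constructor arguments above:

    // task_max_size = 256 bytes, task_max_count = 1000 tasks:
    size_t pool_bytes = size_t( 256 * 1000 * 1.2 );  // 307200 bytes reserved
    size_t superblock = size_t(1) << 16 ;            // 65536-byte superblocks (log2 = 16)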
@ -926,5 +926,5 @@ void Task::clear_dependence()
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -50,7 +50,7 @@
#include <Kokkos_Threads.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
@ -737,10 +737,9 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */

View File

@ -246,8 +246,8 @@ private:
enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
// The allocation record resides in Host memory space
Record * m_record ;
uintptr_t m_record_bits ;
Record * m_record ;
public:

View File

@ -47,8 +47,6 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
/* For backward compatibility */
@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing {
} /* namespace Kokkos */
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -2604,18 +2604,24 @@ class ViewMapping< DstTraits , SrcTraits ,
&&
std::is_same< typename DstTraits::specialize , void >::value
&&
(
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
)
&&
std::is_same< typename SrcTraits::specialize , void >::value
&&
(
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
||
(
(
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
)
&&
(
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
)
)
)
)>::type >
{

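The widened condition adds a same-layout branch in front of the canonical-layout test, so two views whose layouts are identical are always assignable even when that layout is not LayoutLeft/LayoutRight/LayoutStride, while the old Left/Right/Stride cross-assignment path is unchanged. A sketch of what each branch admits, with illustrative views:

    Kokkos::View< double**, Kokkos::LayoutLeft >   a( "a", 10, 10 );
    Kokkos::View< double**, Kokkos::LayoutLeft >   b = a ;  // same-layout branch
    Kokkos::View< double**, Kokkos::LayoutStride > c = a ;  // canonical cross-layout branch (Left -> Stride)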
View File

@ -1,848 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Singleton.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <utility>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
/* Enable clean up of memory leaks */
#define CLEAN_UP_MEMORY_LEAKS 0
namespace Kokkos { namespace Impl {
namespace {
//-----------------------------------------------------------------------------
// AllocationRecord
//-----------------------------------------------------------------------------
//
// Used to track details about an allocation and provide a ref count
// sizeof(AllocationRecord) == 128
struct AllocationRecord
{
enum {
OFFSET = sizeof(AllocatorBase*) // allocator
+ sizeof(void*) // alloc_ptr
+ sizeof(uint64_t) // alloc_size
+ sizeof(AllocatorAttributeBase*) // attribute
+ sizeof(uint32_t) // node_index
+ sizeof(uint32_t) // ref_count
, LABEL_LENGTH = 128 - OFFSET
};
AllocatorBase * const allocator;
void * const alloc_ptr;
const uint64_t alloc_size;
AllocatorAttributeBase * const attribute;
const int32_t node_index;
volatile uint32_t ref_count;
const char label[LABEL_LENGTH];
AllocationRecord( AllocatorBase * const arg_allocator
, void * arg_alloc_ptr
, uint64_t arg_alloc_size
, int32_t arg_node_index
, const std::string & arg_label
)
: allocator(arg_allocator)
, alloc_ptr(arg_alloc_ptr)
, alloc_size(arg_alloc_size)
, attribute(NULL)
, node_index(arg_node_index)
, ref_count(1)
, label() // zero fill
{
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
}
~AllocationRecord()
{
if (attribute) {
delete attribute;
}
}
uint32_t increment_ref_count()
{
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
return old_value + 1u;
}
uint32_t decrement_ref_count()
{
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
return old_value - 1u;
}
void print( std::ostream & oss ) const
{
oss << "{ " << allocator->name()
<< " } : \"" << label
<< "\" ref_count(" << ref_count
<< ") memory[ " << alloc_ptr
<< " + " << alloc_size
<< " ]" ;
}
bool set_attribute( AllocatorAttributeBase * attr )
{
bool result = false;
if (attribute == NULL) {
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
, attr );
}
return result;
}
// disallow copy and assignment
AllocationRecord( const AllocationRecord & );
AllocationRecord & operator=(const AllocationRecord &);
};
template <int NumBlocks>
struct Bitset
{
enum { blocks = NumBlocks };
enum { size = blocks * 64 };
enum { block_mask = 63u };
enum { block_shift = 6 };
// used to find free bits in a bitset
static int count_trailing_zeros(uint64_t x)
{
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
return x ? __builtin_ctzll(x) : 64;
#elif defined( KOKKOS_COMPILER_INTEL )
enum { shift = 32 };
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
64 ;
#elif defined( KOKKOS_COMPILER_IBM )
return x ? __cnttz8(x) : 64;
#else
int i = 0;
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
return i;
#endif
}
Bitset()
: m_bits()
{
for (int i=0; i < blocks; ++i) {
m_bits[i] = 0u;
}
}
bool set( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
}
bool reset( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
}
bool test( int i )
{
const uint64_t block = m_bits[ i >> block_shift ];
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return block & bit;
}
int find_first_unset() const
{
for (int i=0; i < blocks; ++i) {
const uint64_t block = m_bits[i];
int b = count_trailing_zeros( ~block );
if ( b < 64 ) {
return (i << block_shift) + b;
}
}
return size;
}
volatile uint64_t m_bits[blocks];
};
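A usage sketch of the Bitset above (illustrative only, not part of the commit; assumes the definitions in this unnamed namespace). find_first_unset() applies count_trailing_zeros to ~block, so it returns the index of the first zero bit, or Bitset::size when every bit is set:
void bitset_demo()
{
  Bitset<15> bits;                             // 15 * 64 = 960 bits, all initially unset
  int slot = bits.find_first_unset();          // candidate slot
  if ( slot != bits.size && bits.set(slot) )   // set() returns true only for the winning thread
  {
    // slot is now exclusively owned; release it when finished
    bits.reset(slot);
  }
}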
//-----------------------------------------------------------------------------
// AllocationRecordPool -- singleton class
//
// global_alloc_rec_pool is the ONLY instance of this class
//
//-----------------------------------------------------------------------------
// Record AllocationRecords in a lock-free circular list.
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
// The head of the list is atomically updated to the last node found with
// unused space.
//
// Time to create an allocation record: amortized O(1), worst case O(num nodes)
// Time to destroy an allocation record: O(1)
//
// Singleton allocations are pushed onto a lock-free stack that is destroyed
// after the circular list of allocation records.
struct AllocationRecordPool
{
enum { BITSET_BLOCKS = 15 };
typedef Bitset<BITSET_BLOCKS> bitset_type;
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
struct AllocationNode
{
AllocationNode()
: next()
, bitset()
, buffer()
{
// set the first bit to used
bitset.set(0);
}
void * get_buffer( int32_t node_index )
{
return buffer + (node_index-1) * sizeof(AllocationRecord);
}
// return 0 if no space is available in the node
int32_t get_node_index()
{
int32_t node_index = 0;
do {
node_index = bitset.find_first_unset();
// successfully claimed a bit
if ( node_index != bitset.size && bitset.set(node_index) )
{
return node_index;
}
} while ( node_index != bitset.size );
return 0;
}
void clear_node_index( int32_t node_index )
{
bitset.reset(node_index);
}
AllocationNode * next;
bitset_type bitset;
char buffer[BUFFER_SIZE];
};
struct SingletonNode
{
void * buffer;
SingletonNode * next;
Impl::singleton_destroy_function_type destroy;
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
: buffer(NULL)
, next(NULL)
, destroy(destroy_func)
{
if (size) {
buffer = malloc(size);
create_func(buffer);
}
}
~SingletonNode()
{
if (buffer) {
try {
destroy(buffer);
} catch(...) {}
free(buffer);
}
}
};
AllocationRecordPool()
: head( new AllocationNode() )
, singleton_head(NULL)
{
// setup ring
head->next = head;
}
~AllocationRecordPool()
{
// delete allocation records
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
// print node bitset
for (int i=0; i < bitset_type::blocks; ++i ) {
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
}
std::cout << std::endl;
#endif
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
#if CLEAN_UP_MEMORY_LEAKS
/* Cleaning up memory leaks prevents memory error detection tools
* from reporting the original source of allocation, which can
* impede debugging with such tools.
*/
try {
destroy(alloc_rec);
}
catch(...) {}
#endif
}
}
curr->next = NULL;
delete curr;
curr = next;
} while ( curr != start );
//if ( !string_vec.empty() ) {
// std::sort( string_vec.begin(), string_vec.end() );
//
// std::ostringstream oss;
// oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
// for (size_t i=0; i< string_vec.size(); ++i)
// {
// oss << " " << string_vec[i] << std::endl;
// }
//
// std::cerr << oss.str() << std::endl;
//}
}
// delete singletons
{
SingletonNode * curr = singleton_head;
while (curr) {
SingletonNode * next = curr->next;
delete curr;
curr = next;
}
}
}
AllocationRecord * create( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
AllocationNode * start = volatile_load(&head);
AllocationNode * curr = start;
int32_t node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
while (node_index == 0 && curr != start)
{
node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
}
// Need to allocate and insert a new node
if (node_index == 0 && curr == start)
{
AllocationNode * new_node = new AllocationNode();
node_index = new_node->get_node_index();
AllocationNode * next = NULL;
do {
next = volatile_load(&curr->next);
new_node->next = next;
memory_fence();
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
curr = new_node;
}
void * buffer = curr->get_buffer(node_index);
// try to set head to curr
if ( start != curr )
{
atomic_compare_exchange( & head, start, curr );
}
return new (buffer) AllocationRecord( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, node_index
, arg_label
);
}
void destroy( AllocationRecord * alloc_rec )
{
if (alloc_rec) {
const int32_t node_index = alloc_rec->node_index;
AllocationNode * node = get_node( alloc_rec );
// deallocate memory
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
// call destructor
alloc_rec->~AllocationRecord();
// wait for writes to complete
memory_fence();
// clear node index
node->clear_node_index( node_index );
}
}
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
{
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
SingletonNode * next;
// insert new node at the head of the list
do {
next = volatile_load(&singleton_head);
node->next = next;
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
return node->buffer;
}
void print_memory( std::ostream & out ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
}
}
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Tracked Memory:" << std::endl;
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
out << oss.str() << std::endl;
}
else {
out << "No Tracked Memory" << std::endl;
}
}
// find an AllocationRecord such that
// alloc_ptr <= ptr < alloc_ptr + alloc_size
// otherwise return NULL
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
if ( (allocator == alloc_rec->allocator)
&& (alloc_ptr <= char_ptr)
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
{
return alloc_rec;
}
}
}
curr = next;
} while ( curr != start );
return NULL;
}
private:
AllocationNode * get_node( AllocationRecord * alloc_rec )
{
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
}
AllocationNode * head;
SingletonNode * singleton_head;
};
// create the global pool for allocation records
AllocationRecordPool global_alloc_rec_pool;
// convert a uintptr_t to an AllocationRecord pointer
inline
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
{
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
}
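The least-significant bit of a tracker's m_alloc_rec doubles as the reference-counting flag (REF_COUNT_BIT in the tracker header), which is exactly what to_alloc_rec masks off. A self-contained sketch of that tagging scheme (illustrative only; dummy stands in for a suitably aligned AllocationRecord):
#include <cassert>
#include <stdint.h>
int main()
{
  static int dummy = 0;  // stands in for an AllocationRecord (at least 2-byte aligned)
  uintptr_t tagged = reinterpret_cast<uintptr_t>(&dummy) | 0x1u;  // flag bit set
  void * restored = reinterpret_cast<void *>( tagged & ~static_cast<uintptr_t>(1) );
  assert( restored == &dummy );  // the same masking to_alloc_rec performs
  return 0;
}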
} // unnamed namespace
//-----------------------------------------------------------------------------
// Allocation Tracker methods
//-----------------------------------------------------------------------------
// Create a reference counted AllocationTracker
void AllocationTracker::initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
// create record
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, arg_label
);
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
}
void AllocationTracker::reallocate( size_t size ) const
{
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
if ( NULL != the_alloc_ptr )
{
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
*const_cast<uint64_t *>(&rec->alloc_size) = size;
}
else {
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
}
}
void AllocationTracker::increment_ref_count() const
{
to_alloc_rec( m_alloc_rec )->increment_ref_count();
}
void AllocationTracker::decrement_ref_count() const
{
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
if (the_ref_count == 0u) {
try {
global_alloc_rec_pool.destroy( alloc_rec );
}
catch(...) {}
}
}
namespace {
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
}
AllocatorBase * AllocationTracker::allocator() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->allocator;
}
return Allocator<NullAllocator>::singleton();
}
void * AllocationTracker::alloc_ptr() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
}
return NULL;
}
size_t AllocationTracker::alloc_size() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_size;
}
return 0u;
}
size_t AllocationTracker::ref_count() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->ref_count;
}
return 0u;
}
char const * AllocationTracker::label() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->label;
}
return "[Empty Allocation Tracker]";
}
void AllocationTracker::print( std::ostream & oss) const
{
if (m_alloc_rec & REF_COUNT_MASK) {
to_alloc_rec(m_alloc_rec)->print(oss);
}
else {
oss << label();
}
}
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
{
bool result = false;
if (m_alloc_rec & REF_COUNT_MASK) {
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
}
return result;
}
AllocatorAttributeBase * AllocationTracker::attribute() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->attribute;
}
return NULL;
}
void AllocationTracker::print_tracked_memory( std::ostream & out )
{
global_alloc_rec_pool.print_memory( out );
}
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
{
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
AllocationTracker tracker;
if ( alloc_rec != NULL )
{
if ( tracking_enabled() ) {
alloc_rec->increment_ref_count();
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
else {
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
}
}
return tracker ;
}
//-----------------------------------------------------------------------------
// static AllocationTracker
//-----------------------------------------------------------------------------
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
namespace {
// TODO : Detect compiler support for thread local variables
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_PTHREAD )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_SERIAL )
bool g_thread_local_tracking_enabled = true;
#endif
} // unnamed namespace
void AllocationTracker::disable_tracking()
{
g_thread_local_tracking_enabled = false;
}
void AllocationTracker::enable_tracking()
{
g_thread_local_tracking_enabled = true;
}
bool AllocationTracker::tracking_enabled()
{
return g_thread_local_tracking_enabled;
}
#else
namespace {
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
}
void AllocationTracker::disable_tracking()
{
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already disabled");
}
}
void AllocationTracker::enable_tracking()
{
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already enabled");
}
}
bool AllocationTracker::tracking_enabled()
{
return g_tracking_enabled == TRACKING_ENABLED;
}
#endif
//-----------------------------------------------------------------------------
// create singleton free function
//-----------------------------------------------------------------------------
void * create_singleton( size_t size
, Impl::singleton_create_function_type create_func
, Impl::singleton_destroy_function_type destroy_func )
{
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
}
}} // namespace Kokkos::Impl
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
#endif /* #if ! KOKKOS_USING_EXP_VIEW */

View File

@ -1,574 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
#define KOKKOS_ALLOCATION_TRACKER_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <iosfwd>
namespace Kokkos { namespace Impl {
//-----------------------------------------------------------------------------
// Create Singleton objects
//-----------------------------------------------------------------------------
typedef void * (*singleton_create_function_type)(void * buffer);
typedef void (*singleton_destroy_function_type)(void *);
void * create_singleton( size_t size
, singleton_create_function_type create_func
, singleton_destroy_function_type destroy_func
);
/// class Singleton
///
/// Default construct a singleton type. This method is used to circumvent
/// order of construction issues. Singleton objects are destroyed after all
/// other allocations in the reverse order of their creation.
template <typename Type>
class Singleton
{
public:
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
static Type * get()
{
static Type * singleton = NULL;
if (singleton == NULL) {
Impl::singleton_create_function_type create_func = &create;
Impl::singleton_destroy_function_type destroy_func = &destroy;
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
}
return singleton;
}
private:
/// Call the Type destructor
static void destroy(void * ptr)
{
reinterpret_cast<Type*>(ptr)->~Type();
}
/// placement new the Type in buffer
static void * create(void * buffer)
{
return new (buffer) Type();
}
};
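A usage sketch for the Singleton helper above, assuming a hypothetical default-constructible type Cache and an initialized allocation pool (illustrative only):
struct Cache { int hits; Cache() : hits(0) {} };        // hypothetical payload
void singleton_demo()
{
  Cache * c = Kokkos::Impl::Singleton<Cache>::get();    // constructed on first call
  ++c->hits;                                            // same instance everywhere; destroyed
                                                        // after all other allocations, in
                                                        // reverse order of creation
}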
//-----------------------------------------------------------------------------
// AllocatorBase
//-----------------------------------------------------------------------------
/// class AllocatorBase
///
/// Abstract base class for all Allocators.
/// Allocators should be singleton objects; use Singleton<Allocator>::get() to create
/// them and avoid order-of-destruction issues
class AllocatorBase
{
public:
/// name of the allocator
/// used to report memory leaks
virtual const char * name() const = 0;
/// Allocate a buffer of size bytes
virtual void* allocate(size_t size) const = 0;
/// Deallocate a buffer of size bytes
/// The pointer must have been allocated with a call to the corresponding allocate
virtual void deallocate(void * ptr, size_t size) const = 0;
/// Changes the size of the memory block pointed to by ptr.
/// ptr must have been allocated with the corresponding allocate call
/// The function may move the memory block to a new location
/// (whose address is returned by the function).
///
/// The content of the memory block is preserved up to the lesser of the new and
/// old sizes, even if the block is moved to a new location. If the new size is larger,
/// the value of the newly allocated portion is indeterminate.
///
/// If ptr is a null pointer, the function behaves like allocate, allocating a
/// new block of size bytes and returning a pointer to its beginning.
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
/// can a texture object be bound to the allocated memory
virtual bool support_texture_binding() const = 0;
/// virtual destructor
virtual ~AllocatorBase() {}
};
/// class AllocatorAttributeBase
class AllocatorAttributeBase
{
public:
virtual ~AllocatorAttributeBase() {}
};
//-----------------------------------------------------------------------------
// Allocator< StaticAllocator > : public AllocatorBase
//-----------------------------------------------------------------------------
// HasStaticName
template<typename T>
class HasStaticName
{
typedef const char * (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::name>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
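The detection idiom above can be sanity-checked as follows (illustrative only; A and B are hypothetical types):
struct A { static const char * name() { return "A"; } };
struct B {};
static_assert(  HasStaticName<A>::value, "A exposes a static name()" );
static_assert( !HasStaticName<B>::value, "B does not" );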
template <typename T>
inline
typename enable_if<HasStaticName<T>::value, const char *>::type
allocator_name()
{
return T::name();
}
template <typename T>
inline
typename enable_if<!HasStaticName<T>::value, const char *>::type
allocator_name()
{
return "Unnamed Allocator";
}
// HasStaticAllocate
template<typename T>
class HasStaticAllocate
{
typedef void * (*static_method)(size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t size)
{
return T::allocate(size);
}
template <typename T>
inline
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot allocate memory!") );
return NULL;
}
// HasStaticDeallocate
template<typename T>
class HasStaticDeallocate
{
typedef void (*static_method)(void *, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void * ptr, size_t size)
{
T::deallocate(ptr,size);
}
template <typename T>
inline
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void *, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot deallocate memory!") );
}
// HasStaticReallocate
template<typename T>
class HasStaticReallocate
{
typedef void * (*static_method)(void *, size_t, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
return T::reallocate(old_ptr, old_size, new_size);
}
template <typename T>
inline
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void *, size_t, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot reallocate memory!") );
return NULL;
}
// HasStaticSupportTextureBinding
template<typename T>
class HasStaticSupportTextureBinding
{
typedef bool (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return T::support_texture_binding();
}
template <typename T>
inline
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return false;
}
template <typename T>
class Allocator : public AllocatorBase
{
public:
virtual const char * name() const
{
return allocator_name<T>();
}
virtual void* allocate(size_t size) const
{
return allocator_allocate<T>(size);
}
virtual void deallocate(void * ptr, size_t size) const
{
allocator_deallocate<T>(ptr,size);
}
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
{
return allocator_reallocate<T>(old_ptr, old_size, new_size);
}
virtual bool support_texture_binding() const
{
return allocator_support_texture_binding<T>();
}
static AllocatorBase * singleton()
{
return Singleton< Allocator<T> >::get();
}
};
//-----------------------------------------------------------------------------
// AllocationTracker
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct MallocHelper;
/// class AllocationTracker
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
/// Reference counting is disabled when the host is in parallel.
class AllocationTracker
{
// use the least significant bit of the AllocationRecord pointer to indicate if the
// AllocationTracker should reference count
enum {
REF_COUNT_BIT = static_cast<uintptr_t>(1)
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
};
public:
/// Find an AllocationTracker such that
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
/// O(n) where n is the number of tracked allocations.
template <typename StaticAllocator>
static AllocationTracker find( void const * ptr )
{
return find( ptr, Allocator<StaticAllocator>::singleton() );
}
/// Pretty print all the currently tracked memory
static void print_tracked_memory( std::ostream & out );
/// Default constructor
KOKKOS_INLINE_FUNCTION
AllocationTracker()
: m_alloc_rec(0)
{}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label = std::string("") )
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, size_t arg_alloc_size
, const std::string & arg_label = std::string("")
)
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Copy an AllocationTracker
KOKKOS_INLINE_FUNCTION
AllocationTracker( const AllocationTracker & rhs )
: m_alloc_rec( rhs.m_alloc_rec)
{
#if !defined( __CUDA_ARCH__ )
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
#endif
}
/// Copy-assign an AllocationTracker
/// Decrement the reference count of the current tracker if necessary
KOKKOS_INLINE_FUNCTION
AllocationTracker & operator=( const AllocationTracker & rhs )
{
if (this != &rhs) {
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
m_alloc_rec = rhs.m_alloc_rec;
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
#endif
}
return * this;
}
/// Destructor
/// Decrement the reference count if necessary
KOKKOS_INLINE_FUNCTION
~AllocationTracker()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
}
/// Is the tracker valid?
KOKKOS_INLINE_FUNCTION
bool is_valid() const
{
return (m_alloc_rec & REF_COUNT_MASK);
}
/// clear the tracker
KOKKOS_INLINE_FUNCTION
void clear()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
m_alloc_rec = 0;
}
/// is this tracker currently counting allocations?
KOKKOS_INLINE_FUNCTION
bool ref_counting() const
{
return (m_alloc_rec & REF_COUNT_BIT);
}
AllocatorBase * allocator() const;
/// pointer to the allocated memory
void * alloc_ptr() const;
/// size in bytes of the allocated memory
size_t alloc_size() const;
/// the current reference count
size_t ref_count() const;
/// the label given to the allocation
char const * label() const;
/// pretty print all the tracker's information to the std::ostream
void print( std::ostream & oss) const;
/// set an attribute ptr on the allocation record
/// the arg_attribute pointer will be deleted when the record is destroyed
/// the attribute ptr can only be set once
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
/// get the attribute ptr from the allocation record
AllocatorAttributeBase * attribute() const;
/// reallocate the memory tracked by this allocation
/// NOT thread-safe
void reallocate( size_t size ) const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
void initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, std::string const & label );
void increment_ref_count() const;
void decrement_ref_count() const;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_ALLOCATION_TRACKER_HPP
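A usage sketch of the (legacy) tracker interface above, assuming a hypothetical StaticAllocator named MyAlloc (illustrative only, not part of the commit):
#include <cstdlib>
struct MyAlloc {                                        // hypothetical StaticAllocator
  static const char * name() { return "MyAlloc"; }
  static void * allocate(size_t n) { return malloc(n); }
  static void deallocate(void * p, size_t) { free(p); }
};
void tracker_demo()
{
  // allocates 128 bytes via Allocator<MyAlloc> and starts reference counting
  Kokkos::Impl::AllocationTracker t( MyAlloc(), 128, "demo buffer" );
  void * p = t.alloc_ptr();   // the tracked allocation
  (void) p;
}                             // last copy destroyed here -> deallocate runs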

View File

@ -0,0 +1,197 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Concepts.hpp>
#include <impl/Kokkos_Tags.hpp>
namespace Kokkos { namespace Impl {
template < typename ExecutionSpace = void
, typename Schedule = void
, typename WorkTag = void
, typename IndexType = void
, typename IterationPattern = void
>
struct PolicyTraitsBase
{
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
using execution_space = ExecutionSpace;
using schedule_type = Schedule;
using work_tag = WorkTag;
using index_type = IndexType;
using iteration_pattern = IterationPattern;
};
template <typename PolicyBase, typename ExecutionSpace>
struct SetExecutionSpace
{
static_assert( is_void<typename PolicyBase::execution_space>::value
, "Kokkos Error: More than one execution space given" );
using type = PolicyTraitsBase< ExecutionSpace
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename Schedule>
struct SetSchedule
{
static_assert( is_void<typename PolicyBase::schedule_type>::value
, "Kokkos Error: More than one schedule type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, Schedule
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename WorkTag>
struct SetWorkTag
{
static_assert( is_void<typename PolicyBase::work_tag>::value
, "Kokkos Error: More than one work tag given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, WorkTag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IndexType>
struct SetIndexType
{
static_assert( is_void<typename PolicyBase::index_type>::value
, "Kokkos Error: More than one index type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, IndexType
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IterationPattern>
struct SetIterationPattern
{
static_assert( is_void<typename PolicyBase::iteration_pattern>::value
, "Kokkos Error: More than one iteration_pattern given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, IterationPattern
>;
};
template <typename Base, typename... Traits>
struct AnalyzePolicy;
template <typename Base, typename T, typename... Traits>
struct AnalyzePolicy<Base, T, Traits...> : public
AnalyzePolicy<
typename std::conditional< is_execution_space<T>::value , SetExecutionSpace<Base,T>
, typename std::conditional< is_schedule_type<T>::value , SetSchedule<Base,T>
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
, SetWorkTag<Base,T>
>::type >::type >::type >::type>::type::type
, Traits...
>
{};
template <typename Base>
struct AnalyzePolicy<Base>
{
using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
, DefaultExecutionSpace
, typename Base::execution_space
>::type;
using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
, Schedule< Static >
, typename Base::schedule_type
>::type;
using work_tag = typename Base::work_tag;
using index_type = typename std::conditional< is_void< typename Base::index_type >::value
, IndexType< typename execution_space::size_type >
, typename Base::index_type
>::type
::type // nasty hack to make index_type into an integral_type
; // instead of the wrapped IndexType<T> for backwards compatibility
using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
, void // TODO set default iteration pattern
, typename Base::iteration_pattern
>::type;
using type = PolicyTraitsBase< execution_space
, schedule_type
, work_tag
, index_type
, iteration_pattern
>;
};
template <typename... Traits>
struct PolicyTraits
: public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
{};
}} // namespace Kokkos::Impl
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
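A sketch of what the trait analysis above computes, using standard Kokkos policy arguments (illustrative only; assumes a Serial-enabled build and <type_traits>):
using P = Kokkos::Impl::PolicyTraits< Kokkos::Serial
                                    , Kokkos::Schedule< Kokkos::Static >
                                    , Kokkos::IndexType< int >
                                    >;
static_assert( std::is_same< P::execution_space, Kokkos::Serial >::value, "" );
static_assert( std::is_same< P::schedule_type, Kokkos::Schedule< Kokkos::Static > >::value, "" );
static_assert( std::is_same< P::index_type, int >::value
             , "IndexType<int> unwraps to int for backwards compatibility" );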

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
if( return_val == compare ) {
const T tmp = *dest = val;
// Don't use the following line of code here:
//
//const T tmp = *dest = val;
//
// Instead, put each assignment in its own statement. This is
// because the overload of T::operator= for volatile *this should
// return void, not volatile T&. See Kokkos #177:
//
// https://github.com/kokkos/kokkos/issues/177
*dest = val;
const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
@ -239,7 +249,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
{
retval = dest[0];
if ( retval == compare )
dest[0] = val;
dest[0] = val;
}
return retval;
}
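A minimal reproduction of the Kokkos #177 pitfall described in the comments above (illustrative only; MyScalar is a hypothetical user type):
struct MyScalar {
  double x;
  MyScalar & operator=( const MyScalar & rhs ) { x = rhs.x; return *this; }
  // Kokkos-style value types return void from the volatile overload, so the
  // chained form 'const MyScalar tmp = *dest = val;' fails to compile, while
  // '*dest = val; const MyScalar tmp = *dest;' is well-formed:
  void operator=( const MyScalar & rhs ) volatile { x = rhs.x; }
};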

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest ,
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = val;
// Don't use the following line of code here:
//
//const T tmp = *dest = val;
//
// Instead, put each assignment in its own statement. This is
// because the overload of T::operator= for volatile *this should
// return void, not volatile T&. See Kokkos #177:
//
// https://github.com/kokkos/kokkos/issues/177
*dest = val;
const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest ,
// member. The volatile return value implicitly defines a
// dereference that some compilers (gcc 4.7.2) warn is being ignored.
// Suppress warning by casting return to void.
(void)( *dest = val );
//(void)( *dest = val );
*dest = val;
Impl::unlock_address_host_space( (void*) dest );
}
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest ,
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assumed.i != oldval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
@ -156,9 +156,26 @@ T atomic_fetch_add( volatile T * const dest ,
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * dest , const int val )
{
int original = val;
__asm__ __volatile__(
"lock xadd %1, %0"
: "+m" (*dest), "+r" (original)
: "m" (*dest), "r" (original)
: "memory"
);
return original;
}
#else
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * const dest , const int val )
{ return __sync_fetch_and_add(dest,val); }
{ return __sync_fetch_and_add(dest, val); }
#endif
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_add( volatile long int * const dest , const long int val )
@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest ,
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = return_val + val;
// Don't use the following line of code here:
//
//const T tmp = *dest = return_val + val;
//
// Instead, put each assignment in its own statement. This is
// because the overload of T::operator= for volatile *this should
// return void, not volatile T&. See Kokkos #177:
//
// https://github.com/kokkos/kokkos/issues/177
*dest = return_val + val;
const T tmp = *dest;
(void) tmp;
Impl::unlock_address_host_space( (void*) dest );
return return_val;

View File

@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest ,
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assumed.i != oldval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}

View File

@ -48,6 +48,22 @@
namespace Kokkos {
namespace Impl {
template<class Scalar1, class Scalar2>
struct MaxOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return (val1 > val2 ? val1 : val2);
}
};
template<class Scalar1, class Scalar2>
struct MinOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return (val1 < val2 ? val1 : val2);
}
};
template<class Scalar1, class Scalar2>
struct AddOper {
KOKKOS_FORCEINLINE_FUNCTION
@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
namespace Kokkos {
// Fetch_Oper atomics: return value before operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_max(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_min(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mul(volatile T * const dest, const T val) {
@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
// Oper Fetch atomics: return value after operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_max_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_min_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
}
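A usage sketch for the new max/min atomics above, highlighting the fetch/oper naming convention (illustrative only):
void max_demo( volatile int * shared_max, int my_val )
{
  int before = Kokkos::atomic_fetch_max( shared_max, my_val ); // value before the max
  int after  = Kokkos::atomic_max_fetch( shared_max, my_val ); // value after the max
  (void) before; (void) after;
}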
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mul_fetch(volatile T * const dest, const T val) {

View File

@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
typedef int64_t type;
};
#if ! KOKKOS_USING_EXP_VIEW
class AllocationTracker;
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
ViewTraits ,
typename enable_if<
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
( ViewTraits::memory_traits::Atomic )
>::type >
{
private:
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
// (sizeof(typename ViewTraits::const_value_type)==8),
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
// atomic_view_possible;
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
typedef ViewDataHandle self_type;
public:
enum { ReturnTypeIsReference = false };
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
typedef Impl::AtomicDataElement<ViewTraits> return_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
{
return handle_type(arg_data_ptr);
}
};
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
}} // namespace Kokkos::Impl
#endif

View File

@ -1,287 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
void* MallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
if (size) {
ptr = malloc(size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
if (ptr) {
free(ptr);
}
}
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = realloc(old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
/*--------------------------------------------------------------------------*/
namespace {
void * raw_aligned_allocate( size_t size, size_t alignment )
{
void * ptr = NULL;
if ( size ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
posix_memalign( & ptr, alignment , size );
#else
// Over-allocate and round up to guarantee proper alignment.
size_t size_padded = size + alignment + sizeof(void *);
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
#endif
}
return ptr;
}
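A worked example of the fallback path's alignment arithmetic (illustrative numbers, not part of the commit):
//   Suppose malloc returns alloc_ptr = 0x1000 and alignment = 64.
//   address = 0x1000 + sizeof(void*) = 0x1008
//   rem     = 0x1008 % 64 = 8, so offset = 64 - 8 = 56
//   ptr     = 0x1008 + 56 = 0x1040                (64-byte aligned)
//   The word at ptr - sizeof(void*) = 0x1038 stores alloc_ptr, which is how
//   raw_aligned_deallocate below recovers the original pointer to free.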
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
{
if ( ptr ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
free( ptr );
#else
// get the alloc'd pointer
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
free( alloc_ptr );
#endif
}
}
}
void* AlignedAllocator::allocate( size_t size )
{
void * ptr = 0 ;
if ( size ) {
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void AlignedAllocator::deallocate( void * ptr, size_t size )
{
raw_aligned_deallocate( ptr, size);
}
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size < new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, old_size );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
// mmap flags for private anonymous memory allocation
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#else
#define NO_MMAP
#endif
// huge page tables
#if !defined( NO_MMAP )
#if defined( MAP_HUGETLB )
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
#elif defined( MMAP_FLAGS )
#define MMAP_FLAGS_HUGE MMAP_FLAGS
#endif
// threshold to use huge pages
#define MMAP_USE_HUGE_PAGES (1u << 27)
#endif
// read write access to private memory
#if !defined( NO_MMAP )
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
#endif
void* PageAlignedAllocator::allocate( size_t size )
{
void *ptr = NULL;
if (size) {
#if !defined NO_MMAP
if ( size < MMAP_USE_HUGE_PAGES ) {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
} else {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
}
if (ptr == MAP_FAILED) {
ptr = NULL;
}
#else
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
ptr = raw_aligned_allocate( size, page_size);
#endif
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
{
#if !defined( NO_MMAP )
munmap(ptr, size);
#else
raw_aligned_deallocate(ptr, size);
#endif
}
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = NULL;
#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ )
if (old_size != new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
deallocate( old_ptr, old_size );
}
else {
ptr = old_ptr;
}
#else
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
if (ptr == MAP_FAILED) {
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
}
#endif
return ptr;
}
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
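A usage sketch of the allocator classes in this file (illustrative only; assumes the static interface declared in Kokkos_BasicAllocators.hpp):
#include <impl/Kokkos_BasicAllocators.hpp>
void allocator_demo()
{
  void * p = Kokkos::Impl::AlignedAllocator::allocate( 1024 );  // MEMORY_ALIGNMENT-aligned
  Kokkos::Impl::AlignedAllocator::deallocate( p, 1024 );
}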

Some files were not shown because too many files have changed in this diff.