From a55adf4a6848a734a492e2f5dc993041927db08a Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 3 Oct 2017 11:30:00 -0600 Subject: [PATCH] Update to Kokkos r2.04.04 and add workaround for performance regression --- lib/kokkos/CHANGELOG.md | 19 + lib/kokkos/Makefile.kokkos | 32 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 237 ++++++++++++ lib/kokkos/algorithms/unit_tests/Makefile | 12 + lib/kokkos/algorithms/unit_tests/TestROCm.cpp | 112 ++++++ lib/kokkos/bin/hpcbind | 239 ++++++++---- lib/kokkos/bin/kokkos-bind | 221 ----------- lib/kokkos/bin/nvcc_wrapper | 15 +- lib/kokkos/config/master_history.txt | 1 + .../config/trilinos-integration/checkin-test | 2 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 149 ++++++++ .../KokkosExp_Cuda_IterateTile_Refactor.hpp | 160 ++++---- lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp | 4 +- lib/kokkos/core/src/Kokkos_Complex.hpp | 357 +++++++++++++----- lib/kokkos/core/src/Kokkos_Crs.hpp | 9 +- lib/kokkos/core/src/Kokkos_HBWSpace.hpp | 3 +- lib/kokkos/core/src/Kokkos_NumericTraits.hpp | 6 +- lib/kokkos/core/src/Kokkos_ROCm.hpp | 18 + lib/kokkos/core/src/Makefile | 1 + .../core/src/OpenMP/Kokkos_OpenMP_Exec.hpp | 1 + .../core/src/ROCm/Kokkos_ROCm_Reduce.hpp | 12 +- lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp | 4 +- lib/kokkos/core/src/impl/Kokkos_BitOps.hpp | 30 +- lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp | 4 - lib/kokkos/core/unit_test/TestComplex.hpp | 15 +- lib/kokkos/core/unit_test/TestMDRange.hpp | 121 ++++-- 26 files changed, 1222 insertions(+), 562 deletions(-) create mode 100644 lib/kokkos/algorithms/unit_tests/TestROCm.cpp delete mode 100755 lib/kokkos/bin/kokkos-bind diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 43d3f17d63..d414056187 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,24 @@ # Change Log +## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) + +**Implemented enhancements:** + +- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082) +- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071) +- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052) +- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019) +- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952) +- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) + +**Fixed bugs:** + +- Fix reduction\_identity\::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048) +- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) +- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094) +- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070) + ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index b8236e8fd1..4641232a1f 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -443,7 +443,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_CPPFLAGS += 
-I$(MEMKIND_PATH)/include KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_LIBS += -lmemkind + KOKKOS_LIBS += -lmemkind -lnuma tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp ) endif @@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P8. - KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 - KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P8. + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif + endif endif endif @@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P9. - KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 - KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P9 + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + endif + endif endif endif diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 9082e47052..3db9a145d7 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool::free_state(const Random_XorShift102 } +#endif + +#if defined(KOKKOS_ENABLE_ROCM) + + template<> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t* state_; + const int stride_; + friend class Random_XorShift1024_Pool; + public: + + typedef Kokkos::Experimental::ROCm device_type; + typedef Random_XorShift1024_Pool pool_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){ + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + 
uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + rand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; + S = U*U+V*V; + } + return U*std::sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool::Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64 Random_XorShift64_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64(state_(i),i); +#else + return Random_XorShift64(state_(0),0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool::free_state(const Random_XorShift64 &state) const { +#ifdef __HCC_ACCELERATOR__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool::Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024 Random_XorShift1024_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = 
(threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024(state_, p_(i), i); +#else + return Random_XorShift1024(state_, p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool::free_state(const Random_XorShift1024 &state) const { +#ifdef __HCC_ACCELERATOR__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + #endif diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index b74192ef18..a5a10c82ee 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Threads @@ -51,6 +57,9 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda +KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm + KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads @@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda +test-rocm: KokkosAlgorithms_UnitTest_ROCm + ./KokkosAlgorithms_UnitTest_ROCm + test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp new file mode 100644 index 0000000000..720b377ed2 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#ifdef KOKKOS_ENABLE_ROCM + +#include +#include +#include + +#include + +#include + +#include +#include + +namespace Test { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +void rocm_test_random_xorshift64( int num_draws ) +{ + Impl::test_random >(num_draws); +} + +void rocm_test_random_xorshift1024( int num_draws ) +{ + Impl::test_random >(num_draws); +} + + +#define ROCM_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( rocm, Random_XorShift64 ) { \ + rocm_test_random_xorshift64(num_draws); \ + } + +#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( rocm, Random_XorShift1024 ) { \ + rocm_test_random_xorshift1024(num_draws); \ + } + +#define ROCM_SORT_UNSIGNED( size ) \ + TEST_F( rocm, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \ + } + +ROCM_RANDOM_XORSHIFT64( 132141141 ) +ROCM_RANDOM_XORSHIFT1024( 52428813 ) +ROCM_SORT_UNSIGNED(171) + +#undef ROCM_RANDOM_XORSHIFT64 +#undef ROCM_RANDOM_XORSHIFT1024 +#undef ROCM_SORT_UNSIGNED +} +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {} +#endif /* #ifdef KOKKOS_ENABLE_ROCM */ + diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index ca34648780..b88b334f8b 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -27,7 +27,7 @@ fi HPCBIND_HWLOC_PARENT_CPUSET="" if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then MY_PID="$BASHPID" - HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) + HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" fi ################################################################################ @@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) ################################################################################ HPCBIND_QUEUE_NAME="" declare -i HPCBIND_QUEUE_INDEX=0 -declare -i HPCBIND_QUEUE_GPU_MAPPING=0 +declare -i HPCBIND_QUEUE_MAPPING=0 -if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 - HPCBIND_QUEUE_NAME="sbatch" +if [[ ! 
-z "${PMI_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mpich" + HPCBIND_QUEUE_INDEX=${PMI_RANK} +elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="openmpi" + HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK} +elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mvapich2" + HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK} +elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="slurm" HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} elif [[ ! -z "${LBS_JOBINDEX}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="bsub" HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} elif [[ ! -z "${ALPS_APP_PE}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="aprun" HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} fi - ################################################################################ # Show help ################################################################################ @@ -91,13 +102,14 @@ function show_help { echo " --proc-bind= Set the initial process mask for the script" echo " LOC can be any valid location argument for" echo " hwloc-calc Default: all" + echo " --whole-system ${cmd} will ignore the its parent process binding" echo " --distribute=N Distribute the current cpuset into N partitions" echo " --distribute-partition=I" echo " Use the i'th partition (zero based)" echo " --visible-gpus= Comma separated list of gpu ids" echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" echo " sequential order" - echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" + echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" @@ -110,22 +122,30 @@ function show_help { echo " --force-openmp-proc-bind=" echo " Override logic for selecting OMP_PROC_BIND" echo " --no-openmp-nested Set OMP_NESTED to false" - echo " --show-bindings Show the bindings" - echo " --lstopo Show bindings in lstopo without executing a command" - echo " -v|--verbose Show options and relevant environment variables" + echo " --output-prefix=

Save the output to files of the form" + echo " P-N.log, P-N.out and P-N.err where P is the prefix" + echo " and N is the queue index or mpi rank (no spaces)" + echo " --output-mode= How console output should be handled." + echo " Options are all, rank0, and none. Default: rank0" + echo " --lstopo Show bindings in lstopo" + echo " -v|--verbose Print bindings and relevant environment variables" echo " -h|--help Show this message" echo "" echo "Sample Usage:" echo " Split the current process cpuset into 4 and use the 3rd partition" echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." - echo " Bing the process to all even cores" + echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" + echo " and save the output to rank specific files" + echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" + echo " --distribute=4 -v --output-prefix=output -- command ..." + echo " Bind the process to all even cores" echo " ${cmd} --proc-bind=core:even -v -- command ..." - echo " Bind to the first 64 cores and split the current process cpuset into 4" - echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." - echo " skip GPU 0 when mapping visible devices" + echo " Bind the the even cores of socket 0 and the odd cores of socket 1" + echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo " Skip GPU 0 when mapping visible devices" echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." echo " Display the current bindings" - echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command" + echo " ${cmd} --proc-bind=numa:0 -- command" echo " Display the current bindings using lstopo" echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" echo "" @@ -144,7 +164,7 @@ fi declare -a UNKNOWN_ARGS=() declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} declare -i HPCBIND_DISTRIBUTE=1 -declare -i HPCBIND_PARTITION=0 +declare -i HPCBIND_PARTITION=-1 HPCBIND_PROC_BIND="all" HPCBIND_OPENMP_VERSION=4.0 declare -i HPCBIND_OPENMP_PERCENT=100 @@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND="" HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} declare -i HPCBIND_VERBOSE=0 -declare -i HPCBIND_SHOW_BINDINGS=0 declare -i HPCBIND_LSTOPO=0 -for i in $@; do - case $i in +HPCBIND_OUTPUT_PREFIX="" +HPCBIND_OUTPUT_MODE="rank0" + +declare -i HPCBIND_HAS_COMMAND=0 + +for i in "$@"; do + case "$i" in # number of partitions to create --no-hwloc-bind) HPCBIND_ENABLE_HWLOC_BIND=0 @@ -169,6 +193,10 @@ for i in $@; do HPCBIND_PROC_BIND="${i#*=}" shift ;; + --whole-system) + HPCBIND_HWLOC_PARENT_CPUSET="" + shift + ;; --distribute=*) HPCBIND_DISTRIBUTE="${i#*=}" shift @@ -182,8 +210,8 @@ for i in $@; do HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') shift ;; - --gpu-ignore-queue) - HPCBIND_QUEUE_GPU_MAPPING=0 + --ignore-queue) + HPCBIND_QUEUE_MAPPING=0 shift ;; --no-gpu-mapping) @@ -218,14 +246,18 @@ for i in $@; do HPCBIND_OPENMP_NESTED="false" shift ;; - --show-bindings) - HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=1 + --output-prefix=*) + HPCBIND_OUTPUT_PREFIX="${i#*=}" + shift + ;; + --output-mode=*) + HPCBIND_OUTPUT_MODE="${i#*=}" + #convert to lower case + HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}" shift ;; --lstopo) HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=0 HPCBIND_LSTOPO=1 shift ;; @@ -239,6 +271,7 @@ for i in $@; do ;; # ignore remaining arguments --) + HPCBIND_HAS_COMMAND=1 shift break ;; @@ -250,16 +283,41 @@ for i in $@; do esac done 
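+# A worked illustration (editorial note, not part of the upstream script)
+# of how the queue/partition values parsed above feed the GPU selection
+# logic further down in this file:
+#   MY_TASK_ID = HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION
+#   GPU_ID     = MY_TASK_ID % NUM_GPUS
+# e.g. queue rank 3 with --distribute=2 and partition 1 on a 4-GPU node
+# yields MY_TASK_ID=7 and GPU_ID=3, so CUDA_VISIBLE_DEVICES becomes
+# ${HPCBIND_VISIBLE_GPUS[3]}.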
+################################################################################ +# Check output mode +################################################################################ +declare -i HPCBIND_TEE=0 + +if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then + HPCBIND_TEE=0 +elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then + HPCBIND_TEE=1 +elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then + #default to rank0 printing to screen + HPCBIND_TEE=1 +fi + + +if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then + HPCBIND_LOG=/dev/null + HPCBIND_ERR=/dev/null + HPCBIND_OUT=/dev/null +else + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out" + > ${HPCBIND_LOG} +fi + ################################################################################ # Check unknown arguments ################################################################################ if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" + echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG}) exit 1 fi - ################################################################################ # Check that visible gpus are valid ################################################################################ @@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then - echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0" + echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG}) HPCBIND_VISIBLE_GPUS[$i]=0; fi done NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} fi - ################################################################################ # Check OpenMP percent ################################################################################ if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then - echo "OpenMP percent < 1, setting to 1" HPCBIND_OPENMP_PERCENT=1 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then - echo "OpenMP percent > 100, setting to 100" HPCBIND_OPENMP_PERCENT=100 fi @@ -291,15 +346,21 @@ fi # Check distribute ################################################################################ if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" HPCBIND_DISTRIBUTE=1 fi -if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then - echo "Invalid input for distribute-partition, changing to 0" +################################################################################ +#choose the correct partition +################################################################################ +if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then + HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX} +elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then HPCBIND_PARTITION=0 fi +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)) +fi ################################################################################ # Find cpuset and num threads @@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]}) else - 
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]}) fi - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) - HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}" + else + HPCBIND_HWLOC_CPUSET="${BINDING}" + fi HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) else HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) @@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED} ################################################################################ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then - if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" else declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" fi fi @@ -389,22 +454,22 @@ fi export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} -export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then export HPCBIND_HWLOC_PARENT_CPUSET="all" else - export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} + export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" fi -export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') -export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} - export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} - export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} + export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" + export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} fi @@ -412,43 +477,63 @@ fi # Print verbose ################################################################################ -if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then - MY_ENV=$(env | sort) - echo "[HPCBIND]" - echo "${MY_ENV}" | grep -E "^HPCBIND_" - echo "[CUDA]" - echo "${MY_ENV}" | grep -E "^CUDA_" - echo "[OPENMP]" - echo "${MY_ENV}" | grep -E "^OMP_" -fi +TMP_ENV=$(env | sort) +if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then + echo "[HOST]" >> ${HPCBIND_LOG} + hostname -s >> ${HPCBIND_LOG} + echo "[HPCBIND]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG} + echo "[CUDA]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} + echo "[OPENMP]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^OMP_" 
>> ${HPCBIND_LOG} -if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu -elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG} + else + echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG} + fi +else + echo "[HOST]" > >(tee -a ${HPCBIND_LOG}) + hostname -s > >(tee -a ${HPCBIND_LOG}) + echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG}) + echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG}) + else + echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG}) + fi fi ################################################################################ # Run command ################################################################################ -if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ - else - eval $@ - fi -else - if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 +# must be the last executed command so that the return value is correct +if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0 +elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then + # clear output files + > ${HPCBIND_ERR} + > ${HPCBIND_OUT} + if [[ ${HPCBIND_TEE} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + else + eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + fi fi fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind deleted file mode 100755 index b6fe07a1bd..0000000000 --- a/lib/kokkos/bin/kokkos-bind +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env bash - -# check if hwloc commands exist -declare -i HAS_HWLOC=0 -type hwloc-bind >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-distrib >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ls >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-calc >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ps >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" 
- - -#parse args -declare -a UNKNOWN_ARGS=() -declare -i DISTRIBUTE=1 -declare -i INDEX=0 -PROC_BIND="all" -CURRENT_CPUSET="" -OPENMP_VERSION=4.0 -OPENMP_PROC_BIND=True -OPENMP_NESTED=True -VERBOSE=False - -#get the current process cpuset -if [[ ${HAS_HWLOC} -eq 0 ]]; then - MY_PID="$BASHPID" - CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) - echo "$CURRENT_CPUSET" -fi - -function show_help { - local cmd=$(basename "$0") - echo "Usage: ${cmd} -- command ..." - echo " Uses hwloc to divide the node into the given number of groups," - echo " set the appropriate OMP_NUM_THREADS and execute the command on the" - echo " selected group." - echo "" - echo " NOTE: This command assumes it has exclusive use of the node" - echo "" - echo "Options:" - echo " --proc-bind= Set the initial process mask for the script. " - echo " LOC can be any valid location argumnet for" - echo " hwloc-calc. Defaults to the entire machine" - echo " --distribute=N Distribute the current proc-bind into N groups" - echo " --index=I Use the i'th group (zero based)" - echo " --openmp=M.m Set env variables for the given OpenMP version" - echo " (default 4.0)" - echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" - echo " --no-openmp-nested Set OMP_NESTED to false" - echo " -v|--verbose" - echo " -h|--help" - echo "" - echo "Sample Usage:" - echo " ${cmd} --distribute=4 --index=2 -v -- command ..." - echo "" -} - -if [[ "$#" -eq 0 ]]; then - show_help - exit 0 -fi - - -for i in $@; do - case $i in - # number of partitions to create - --proc-bind=*) - PROC_BIND="${i#*=}" - shift - ;; - --distribute=*) - DISTRIBUTE="${i#*=}" - shift - ;; - # which group to use - --index=*) - INDEX="${i#*=}" - shift - ;; - --openmp=*) - OPENMP_VERSION="${i#*=}" - shift - ;; - --no-openmp-proc-bind) - OPENMP_PROC_BIND=False - shift - ;; - --no-openmp-nested) - OPENMP_NESTED=False - shift - ;; - -v|--verbose) - VERBOSE=True - shift - ;; - -h|--help) - show_help - exit 0 - ;; - # ignore remaining arguments - --) - shift - break - ;; - # unknown option - *) - UNKNOWN_ARGS+=("$i") - shift - ;; - esac -done - -if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" - exit 1 -fi - -if [[ ${DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" - DISTRIBUTE=1 -fi - -if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then - echo "Invalid input for index, changing index to 0" - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -ne 0 ]]; then - echo "hwloc not found, no process binding will occur" - DISTRIBUTE=1 - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -eq 0 ]]; then - - if [[ "${CURRENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${PROC_BIND}) - else - BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND}) - fi - - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE})) - CPUSET=${CPUSETS[${INDEX}]} - NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: true" - echo " proc_bind: ${PROC_BIND}" - echo " distribute: ${DISTRIBUTE}" - echo " index: ${INDEX}" - echo " parent_cpuset: ${CURRENT_CPUSET}" - echo " cpuset: ${CPUSET}" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else 
- export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - hwloc-bind ${CPUSET} -- $@ -else - NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: false" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - eval $@ -fi - diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 09fa5d500a..76e33f3c66 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp} # Check if we have an optimization argument already optimization_applied=0 +# Check if we have -std=c++X or --std=c++X already +stdcxx_applied=0 + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -130,10 +133,16 @@ do cuda_args="$cuda_args $1 $2" shift ;; - #Handle c++11 setting - --std=c++11|-std=c++11) - shared_args="$shared_args $1" + #Handle c++11 + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z) + if [ $stdcxx_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting" + else + shared_args="$shared_args $1" + stdcxx_applied=1 + fi ;; + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 -std=c++98|--std=c++98) ;; diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index 96b05c02e1..6f9ca897d9 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 +tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test index 92a1b1c068..ffb565fcbb 100644 --- a/lib/kokkos/config/trilinos-integration/checkin-test +++ b/lib/kokkos/config/trilinos-integration/checkin-test @@ -1,4 +1,4 @@ module purge -module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base +module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu #Run Trilinos 
CheckinTest diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 0408472c68..996b6b5610 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -125,6 +125,123 @@ namespace Impl { }; } +/// \class GraphRowViewConst +/// \brief View of a row of a sparse graph. +/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph. +/// +/// This class provides a generic view of a row of a sparse graph. +/// We intended this class to view a row of a StaticCrsGraph, but +/// GraphType need not necessarily be CrsMatrix. +/// +/// The row view is suited for computational kernels like sparse +/// matrix-vector multiply, as well as for modifying entries in the +/// sparse matrix. The view is always const as it does not allow graph modification. +/// +/// Here is an example loop over the entries in the row: +/// \code +/// typedef typename GraphRowViewConst::ordinal_type ordinal_type; +/// +/// GraphRowView G_i = ...; +/// const ordinal_type numEntries = G_i.length; +/// for (ordinal_type k = 0; k < numEntries; ++k) { +/// ordinal_type j = G_i.colidx (k); +/// // ... do something with A_ij and j ... +/// } +/// \endcode +/// +/// GraphType must provide the \c data_type +/// typedefs. In addition, it must make sense to use GraphRowViewConst to +/// view a row of GraphType. In particular, column +/// indices of a row must be accessible using the entries +/// resp. colidx arrays given to the constructor of this +/// class, with a constant stride between successive entries. +/// The stride is one for the compressed sparse row storage format (as +/// is used by CrsMatrix), but may be greater than one for other +/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal). +template +struct GraphRowViewConst { + //! The type of the column indices in the row. + typedef const typename GraphType::data_type ordinal_type; + +private: + //! Array of (local) column indices in the row. + ordinal_type* colidx_; + /// \brief Stride between successive entries in the row. + /// + /// For compressed sparse row (CSR) storage, this is always one. + /// This might be greater than one for storage formats like ELLPACK + /// or Jagged Diagonal. Nevertheless, the stride can never be + /// greater than the number of rows or columns in the matrix. Thus, + /// \c ordinal_type is the correct type. + const ordinal_type stride_; + +public: + /// \brief Constructor + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( ordinal_type* const colidx_in, + const ordinal_type& stride, + const ordinal_type& count) : + colidx_ (colidx_in), stride_ (stride), length (count) + {} + + /// \brief Constructor with offset into \c colidx array + /// + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array + /// + /// \tparam OffsetType The type of \c idx (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. 
+ /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( const typename GraphType::entries_type& colidx_in, + const ordinal_type& stride, + const ordinal_type& count, + const OffsetType& idx, + const typename std::enable_if::value, int>::type& = 0) : + colidx_ (&colidx_in(idx)), stride_ (stride), length (count) + {} + + /// \brief Number of entries in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. + const ordinal_type length; + + /// \brief (Const) reference to the column index of entry i in this + /// row of the sparse matrix. + /// + /// "Entry i" is not necessarily the entry with column index i, nor + /// does i necessarily correspond to the (local) row index. + KOKKOS_INLINE_FUNCTION + ordinal_type& colidx (const ordinal_type& i) const { + return colidx_[i*stride_]; + } + + /// \brief An alias for colidx + KOKKOS_INLINE_FUNCTION + ordinal_type& operator()(const ordinal_type& i) const { + return colidx(i); + } +}; + + /// \class StaticCrsGraph /// \brief Compressed row storage array. /// @@ -218,6 +335,38 @@ public: static_cast (0); } + /// \brief Return a const view of row i of the graph. + /// + /// If row i does not belong to the graph, return an empty view. + /// + /// The returned object \c view implements the following interface: + ///

+  /// <ul>
+  /// <li> \c view.length is the number of entries in the row </li>
+  /// <li> \c view.colidx(k) returns a const reference to the
+  ///   column index of the k-th entry in the row </li>
+  /// </ul>
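+  ///
+  /// A brief usage sketch (illustrative only; it assumes a graph \c G
+  /// of this type and a valid row index \c i):
+  /// \code
+  /// auto view = G.rowConst (i);
+  /// for (data_type k = 0; k < view.length; ++k) {
+  ///   const data_type j = view.colidx (k);
+  ///   // ... use the column index j ...
+  /// }
+  /// \endcode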
+ /// k is not a column index; it just counts from 0 to + /// view.length - 1. + /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. That allows compile-time + /// polymorphism for different kinds of sparse matrix formats (e.g., + /// ELLPACK or Jagged Diagonal) that we may wish to support in the + /// future. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst rowConst (const data_type i) const { + const size_type start = row_map(i); + // count is guaranteed to fit in ordinal_type, as long as no row + // has duplicate entries. + const data_type count = static_cast (row_map(i+1) - start); + + if (count == 0) { + return GraphRowViewConst (NULL, 1, 0); + } else { + return GraphRowViewConst (entries, 1, count, start); + } + } + /** \brief Create a row partitioning into a given number of blocks * balancing non-zeros + a fixed cost per row. */ diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp index 46321378d9..c184c14d07 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1); } @@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(offset_0 , offset_1); } @@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> if (RP::inner_direction == RP::Left) { // Loop over size maxnumblocks until full range covered for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( 
index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1); } @@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(Tag(), offset_0 , offset_1); } @@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type 
offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + 
(index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0; 
i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); } @@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); } @@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ 
-613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( 
index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type 
thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + 
(index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index cae8ecd489..079d9f0889 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory() template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver ) template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 26b47a8b74..f8355f0d06 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -242,45 +242,89 @@ public: re_ = v; } + template KOKKOS_INLINE_FUNCTION - complex& operator += (const complex& src) { + complex& + operator += (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; return *this; } + template 
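/* Two notes on the hunks above. (1) The MDRange fix for issue #1041: each
   per-dimension offset now adds the policy's lower bound, e.g. for dimension 0

     offset_0 = tile_id0 * m_tile[0] + threadIdx.x + m_lower[0];

   so with m_lower[0] = 2, m_tile[0] = 3, tile_id0 = 1, threadIdx.x = 2 the
   iterate visits index 1*3 + 2 + 2 = 7; previously the m_lower term was
   dropped on GPUs and iteration silently started from 0. (2) Commenting out
   __launch_bounds__(maxTperB, minBperSM) in Kokkos_CudaExec.hpp is presumably
   the workaround named in the commit title: the bound caps the registers nvcc
   may assign per thread, and removing it lets the compiler choose freely, at
   the cost of possibly lower occupancy. */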
KOKKOS_INLINE_FUNCTION - void operator += (const volatile complex& src) volatile { + void + operator += (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; } KOKKOS_INLINE_FUNCTION - complex& operator += (const RealType& src) { + complex& + operator += (const std::complex& src) { + re_ += src.real(); + im_ += src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator += (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator += (const volatile RealType& src) volatile { + void + operator += (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; } - + + template KOKKOS_INLINE_FUNCTION - complex& operator -= (const complex& src) { + complex& + operator -= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src.re_; im_ -= src.im_; return *this; } KOKKOS_INLINE_FUNCTION - complex& operator -= (const RealType& src) { + complex& + operator -= (const std::complex& src) { + re_ -= src.real(); + im_ -= src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator -= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src; return *this; } + template KOKKOS_INLINE_FUNCTION - complex& operator *= (const complex& src) { + complex& + operator *= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -288,8 +332,12 @@ public: return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile complex& src) volatile { + void + operator *= (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -297,20 +345,70 @@ public: } KOKKOS_INLINE_FUNCTION - complex& operator *= (const RealType& src) { + complex& + operator *= (const std::complex& src) { + const RealType realPart = re_ * src.real() - im_ * src.imag(); + const RealType imagPart = re_ * src.imag() + im_ * src.real(); + re_ = realPart; + im_ = imagPart; + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator *= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile RealType& src) volatile { + void + operator *= (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; } + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const complex& y) { + complex& + operator /= (const complex& y) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. 
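/* A minimal usage sketch of the templated compound assignments added in this
   hunk, assuming the template headers read template <class InputRealType> and
   the static_asserts check std::is_convertible<InputRealType, RealType>::value,
   as their error messages indicate:

     Kokkos::complex<double> z(1.0, 2.0);
     z += 3.0f;                                // real scalar of another type
     z *= Kokkos::complex<float>(2.0f, 0.0f);  // mixed-precision complex
     z -= std::complex<double>(0.5, 0.5);      // std::complex overload (host code)

   A type that is not convertible to RealType now fails at compile time in the
   static_assert instead of deep inside the arithmetic. */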
+ // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. + const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ()); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. + if (s == 0.0) { + this->re_ /= s; + this->im_ /= s; + } + else { + const complex x_scaled (this->re_ / s, this->im_ / s); + const complex y_conj_scaled (y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + complex& + operator /= (const std::complex& y) { + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. @@ -334,57 +432,95 @@ public: return *this; } + + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const RealType& src) { + complex& + operator /= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + re_ /= src; im_ /= src; return *this; } + template KOKKOS_INLINE_FUNCTION - bool operator == (const complex& src) { - return (re_ == src.re_) && (im_ == src.im_); + bool + operator == (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src.re_)) && (im_ == static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator == (const RealType src) { - return (re_ == src) && (im_ == RealType(0)); + bool + operator == (const std::complex& src) { + return (re_ == src.real()) && (im_ == src.imag()); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator == (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src)) && (im_ == RealType(0)); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator != (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ != static_cast(src.re_)) || (im_ != static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator != (const complex& src) { - return (re_ != src.re_) || (im_ != src.im_); + bool + operator != (const std::complex& src) { + return (re_ != src.real()) || (im_ != src.imag()); } + template KOKKOS_INLINE_FUNCTION - bool operator != (const RealType src) { - return (re_ != src) || (im_ != RealType(0)); - } + bool + operator != (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + return (re_ != static_cast(src)) || (im_ != RealType(0)); + } + }; //! Binary + operator for complex complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const complex& y) { - return complex (x.real () + y.real (), x.imag () + y.imag ()); +complex::type> +operator + (const complex& x, const complex& y) { + return complex::type > (x.real () + y.real (), x.imag () + y.imag ()); } //! Binary + operator for complex scalar. 
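/* The nonmember binary operators in this hunk take two real-type template
   parameters and deduce the result through std::common_type, assuming the
   return type reads complex<typename std::common_type<RealType1,
   RealType2>::type> as the surrounding casts suggest. A small promotion
   sketch with hypothetical values:

     Kokkos::complex<float> cf(1.0f, 2.0f);
     auto p = cf + 3.0;                              // double scalar -> Kokkos::complex<double>
     auto q = cf + Kokkos::complex<double>(1.0, 1.0); // -> Kokkos::complex<double>
 */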
-template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const RealType& y) { - return complex (x.real () + y , x.imag ()); +complex::type> +operator + (const complex& x, const RealType2& y) { + return complex::type> (x.real () + y , x.imag ()); } //! Binary + operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const RealType& x, const complex& y) { - return complex (x + y.real (), y.imag ()); +complex::type> +operator + (const RealType1& x, const complex& y) { + return complex::type> (x + y.real (), y.imag ()); } //! Unary + operator for complex. @@ -396,27 +532,27 @@ operator + (const complex& x) { } //! Binary - operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const complex& y) { - return complex (x.real () - y.real (), x.imag () - y.imag ()); +complex::type> +operator - (const complex& x, const complex& y) { + return complex::type> (x.real () - y.real (), x.imag () - y.imag ()); } //! Binary - operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const RealType& y) { - return complex (x.real () - y , x.imag ()); +complex::type> +operator - (const complex& x, const RealType2& y) { + return complex::type> (x.real () - y , x.imag ()); } //! Binary - operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const RealType& x, const complex& y) { - return complex (x - y.real (), - y.imag ()); +complex::type> +operator - (const RealType1& x, const complex& y) { + return complex::type> (x - y.real (), - y.imag ()); } //! Unary - operator for complex. @@ -428,12 +564,12 @@ operator - (const complex& x) { } //! Binary * operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +complex::type> +operator * (const complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for std::complex and complex. @@ -446,33 +582,34 @@ operator * (const complex& x, const complex& y) { /// This function cannot be called in a CUDA device function, because /// std::complex's methods and nonmember functions are not marked as /// CUDA device functions. -template -complex -operator * (const std::complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +template +inline +complex::type> +operator * (const std::complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const RealType& x, const complex& y) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const RealType1& x, const complex& y) { + return complex::type> (x * y.real (), x * y.imag ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. 
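/* A worked example of the overflow-guarded division used by operator/= above
   and the nonmember operator/ below: both first scale by the 1-norm
   s = |Re y| + |Im y|, then compute x/y as (x/s) * conj(y/s) / |y/s|^2.
   With x = 1+0i and y = 2+2i: s = 4, x/s = 0.25, conj(y/s) = 0.5-0.5i,
   |y/s|^2 = 0.5, so x/y = (0.125-0.125i)/0.5 = 0.25-0.25i, matching direct
   division while keeping every intermediate magnitude near 1. */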
-template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& y, const RealType& x) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const complex& y, const RealType2& x) { + return complex::type> (x * y.real (), x * y.imag ()); } //! Imaginary part of a complex number. @@ -539,33 +676,34 @@ complex pow (const complex& x) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const complex& x, const RealType2& y) { - return complex (real (x) / y, imag (x) / y); + return complex::type> (real (x) / y, imag (x) / y); } //! Binary operator / for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator / (const complex& x, const complex& y) { +complex::type> +operator / (const complex& x, const complex& y) { // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs (real (y)) + std::fabs (imag (y)); + typedef typename std::common_type::type common_real_type; + const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, // because y/s is NaN. if (s == 0.0) { - return complex (real (x) / s, imag (x) / s); + return complex (real (x) / s, imag (x) / s); } else { - const complex x_scaled (real (x) / s, imag (x) / s); - const complex y_conj_scaled (real (y) / s, -imag (y) / s); - const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + + const complex x_scaled (real (x) / s, imag (x) / s); + const complex y_conj_scaled (real (y) / s, -imag (y) / s); + const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y)) - complex result = x_scaled * y_conj_scaled; + complex result = x_scaled * y_conj_scaled; result /= y_scaled_abs; return result; } @@ -574,16 +712,19 @@ operator / (const complex& x, const complex& y) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const RealType1& x, const complex& y) { - return complex (x)/y; + return complex::type> (x)/y; } //! Equality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const complex& y) { - return real (x) == real (y) && imag (x) == imag (y); +bool +operator == (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(real (y)) && + static_cast(imag (x)) == static_cast(imag (y)) ); } /// \brief Equality operator for std::complex and Kokkos::complex. @@ -592,50 +733,68 @@ bool operator == (const complex& x, const complex& y) { /// Otherwise, CUDA builds will give compiler warnings ("warning: /// calling a constexpr __host__ function("real") from a __host__ /// __device__ function("operator==") is not allowed"). -template -bool operator == (const std::complex& x, const complex& y) { - return std::real (x) == real (y) && std::imag (x) == imag (y); +template +inline +bool +operator == (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) == static_cast(real (y)) && + static_cast(std::imag (x)) == static_cast(imag (y)) ); } - + //! 
Equality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const RealType2& y) { - return real (x) == y && imag (x) == static_cast (0.0); +bool +operator == (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(y) && + static_cast(imag (x)) == static_cast(0.0) ); } //! Equality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const RealType& x, const complex& y) { +bool +operator == (const RealType1& x, const complex& y) { return y == x; } //! Inequality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const complex& y) { - return real (x) != real (y) || imag (x) != imag (y); +bool +operator != (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(real (y)) || + static_cast(imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for std::complex and Kokkos::complex. -template -KOKKOS_INLINE_FUNCTION -bool operator != (const std::complex& x, const complex& y) { - return std::real (x) != real (y) || std::imag (x) != imag (y); +template +inline +bool +operator != (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) != static_cast(real (y)) || + static_cast(std::imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const RealType2& y) { - return real (x) != y || imag (x) != static_cast (0.0); +bool +operator != (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(y) || + static_cast(imag (x)) != static_cast(0.0) ); } //! Inequality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const RealType& x, const complex& y) { +bool +operator != (const RealType1& x, const complex& y) { return y != x; } diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index f089c16ad2..b9c131cd7a 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -353,7 +353,14 @@ struct CountAndFill { struct Fill {}; KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); - data_type* fill = &(m_crs.entries(j)); + /* we don't want to access entries(entries.size()), even if its just to get its + address and never use it. + this can happen when row (i) is empty and all rows after it are also empty. + we could compare to row_map(i + 1), but that is a read from global memory, + whereas dimension_0() should be part of the View in registers (or constant memory) */ + data_type* fill = + (j == static_cast(m_crs.entries.dimension_0())) ? 
+ nullptr : (&(m_crs.entries(j)));
     m_functor(i, fill);
   }
   using self_type = CountAndFill;
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index 9c9af0dd8b..b811751a2c 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -147,12 +147,11 @@ public:
     , const size_t arg_alloc_size ) const;
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return "HBW"; }
 private:
   AllocationMechanism  m_alloc_mech;
-  static constexpr const char* m_name = "HBW";
   friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
index 339571941d..a825fd54d3 100644
--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -192,7 +192,7 @@ template<> struct reduction_identity<float> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  {return static_cast<float>(0.0f);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return FLT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return -FLT_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  {return FLT_MAX;}
 };
@@ -200,7 +200,7 @@ template<> struct reduction_identity<double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  {return static_cast<double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return DBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return -DBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  {return DBL_MAX;}
 };
@@ -208,7 +208,7 @@ template<> struct reduction_identity<long double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  {return static_cast<long double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return LDBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return -LDBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  {return LDBL_MAX;}
 };
diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp
index b13b0b01de..0118d4667e 100644
--- a/lib/kokkos/core/src/Kokkos_ROCm.hpp
+++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp
@@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
 } // namespace Kokkos
+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x (hc_get_group_id(0))
+#define blockIdx_y (hc_get_group_id(1))
+#define blockIdx_z (hc_get_group_id(2))
+
+#define blockDim_x (hc_get_group_size(0))
+#define blockDim_y (hc_get_group_size(1))
+#define blockDim_z (hc_get_group_size(2))
+
+#define gridDim_x (hc_get_num_groups(0))
+#define gridDim_y (hc_get_num_groups(1))
+#define gridDim_z (hc_get_num_groups(2))
+
+
 #include
 #include
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
index 8fb13b8954..a917cf1656 100644
--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@@ -88,6 +88,7 @@ build-makefile-kokkos:
 	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
 	echo "#Variables used in application Makefiles" 
>> Makefile.kokkos + echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 37d2ac8318..de84f6e59f 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f , thread_local_bytes ); + omp_set_num_threads(partition_size); f( omp_get_thread_num(), omp_get_num_threads() ); Impl::t_openmp_instance->~Exec(); diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp index 0b7a1e2583..f2674e5929 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp @@ -113,7 +113,6 @@ void reduce_enqueue( if (output_length < 1) return; - assert(output_result != nullptr); const auto td = get_tile_desc(szElements,output_length,team_size,vector_size, shared_size); // allocate host and device memory for the results from each team @@ -176,14 +175,17 @@ void reduce_enqueue( } }); - ValueInit::init(ReducerConditional::select(f, reducer), output_result); + if (output_result != nullptr) + ValueInit::init(ReducerConditional::select(f, reducer), output_result); fut.wait(); copy(result,result_cpu.data()); - for(std::size_t i=0;i result(td.num_tiles); hc::array scratch(len); - tile_for(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] + tile_for(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] { const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; @@ -135,7 +135,7 @@ void scan_enqueue( ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]); copy(result_cpu.data(),result); - hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] + hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] { // const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp index 3d3029535e..c5e73c8b26 100644 --- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept return full != i ? _bit_scan_forward( ~i ) : -1 ; #elif defined( KOKKOS_COMPILER_IBM ) return full != i ? __cnttz4( ~i ) : -1 ; +#elif defined( KOKKOS_COMPILER_CRAYC ) + return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return full != i ? __builtin_ffs( ~i ) - 1 : -1 ; #else @@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i ) return _bit_scan_forward(i); #elif defined( KOKKOS_COMPILER_IBM ) return __cnttz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? 
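/* For the KOKKOS_COMPILER_CRAYC fallback that this expression continues with:
   ~i & (i-1) keeps exactly the bits below the lowest set bit of i, so _popcnt
   of it is that bit's index. Worked example: i = 0b10100 -> i-1 = 0b10011,
   ~i & (i-1) = 0b00011, _popcnt = 2, and bit 2 is indeed the lowest set bit
   of i. bit_first_zero above plays the same trick with
   _popcnt( i ^ (i+1) ) - 1. */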
_popcnt(~i & (i-1)) : -1;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_ffs(i) - 1;
 #else
-  unsigned t = 1u;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t << 1;
-    ++r;
+  int offset = -1;
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
   }
-  return r;
+  return offset;
 #endif
 }
@@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
   return _bit_scan_reverse(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return shift - __cntlz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? shift - _leadz32(i) : 0 ;
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
 #else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t >> 1;
-    ++r;
+  int offset = 0;
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
   }
-  return r;
+  return offset;
 #endif
 }
@@ -142,6 +142,8 @@ int bit_count( unsigned i )
   return _popcnt32(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __popcnt4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return _popcnt(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_popcount(i);
 #else
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index e11f8b6d34..cd0553218d 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
   }
 }
-constexpr const char* HBWSpace::name() {
-  return m_name;
-}
-
 } // namespace Experimental
 } // namespace Kokkos
diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp
index ce5537fed3..c7f681699e 100644
--- a/lib/kokkos/core/unit_test/TestComplex.hpp
+++ b/lib/kokkos/core/unit_test/TestComplex.hpp
@@ -114,7 +114,7 @@ struct TestComplexBasicMath {
   typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
   void testit () {
-    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
     h_results = Kokkos::create_mirror_view(d_results);
     Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@@ -125,6 +125,7 @@
     std::complex<double> b(3.25,5.75);
     std::complex<double> d(1.0,2.0);
     double c = 9.3;
+    int e = 2;
     std::complex<double> r;
     r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag());
@@ -147,6 +148,12 @@
     r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
     r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
     r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a;
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e);
   }
   KOKKOS_INLINE_FUNCTION
@@ -190,6 +197,12 @@ struct TestComplexBasicMath {
     d_results(17) = c-a;
     d_results(18) = c*a;
     d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = 
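/* The stores for d_results(20..23) exercise the new mixed complex/int
   operators on the device: taking a = (1.5, 2.5) for illustration and e = 2,
   the expected values are a+e = (3.5, 2.5), a-e = (-0.5, 2.5),
   a*e = (3.0, 5.0) and a/e = (0.75, 1.25); the int operand is promoted to
   double through the common_type machinery added in Kokkos_Complex.hpp. */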
a-e; + d_results(22) = a*e; + d_results(23) = a/e; } }; diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index f579ddf02c..fbc3a65c2f 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -286,7 +286,9 @@ struct TestMDRange_2D { // Test with reducers - scalar { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; - range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + int s0 = 1; + int s1 = 1; + range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} ); TestMDRange_2D functor( N0, N1 ); @@ -297,7 +299,7 @@ struct TestMDRange_2D { parallel_reduce( range, functor, reducer_scalar ); - ASSERT_EQ( sum, 2 * N0 * N1 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) ); } // Test with reducers - scalar view { @@ -445,7 +447,9 @@ struct TestMDRange_2D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + const int s0 = 1; + const int s1 = 1; + range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); parallel_for( range, functor ); @@ -454,8 +458,8 @@ struct TestMDRange_2D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) { if ( h_view( i, j ) != 3 ) { ++counter; @@ -463,7 +467,7 @@ struct TestMDRange_2D { } if ( counter != 0 ) { - printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); + printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -699,6 +703,7 @@ struct TestMDRange_2D { ASSERT_EQ( counter, 0 ); } + } // end test_for2 }; // MDRange_2D @@ -749,7 +754,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); @@ -757,7 +765,7 @@ struct TestMDRange_3D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) ); } // Test with reducers - scalar @@ -952,7 +960,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); parallel_for( range, functor ); @@ -961,9 +972,9 @@ struct TestMDRange_3D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) { if ( h_view( i, j, k ) != 3 ) { ++counter; @@ -971,7 +982,7 @@ struct TestMDRange_3D 
{ } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -1207,7 +1218,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); @@ -1215,7 +1230,7 @@ struct TestMDRange_4D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) ); } // Test with reducers - scalar @@ -1415,7 +1430,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); parallel_for( range, functor ); @@ -1424,10 +1443,10 @@ struct TestMDRange_4D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) { if ( h_view( i, j, k, l ) != 3 ) { ++counter; @@ -1435,7 +1454,7 @@ struct TestMDRange_4D { } if ( counter != 0 ) { - printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); + printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); } ASSERT_EQ( counter, 0 ); @@ -1682,7 +1701,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); @@ -1690,7 +1714,7 @@ struct TestMDRange_5D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) ); } // Test with reducers - scalar @@ -1810,7 +1834,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, 
N4 ); parallel_for( range, functor ); @@ -1819,11 +1848,11 @@ struct TestMDRange_5D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) { if ( h_view( i, j, k, l, m ) != 3 ) { ++counter; @@ -1831,7 +1860,7 @@ struct TestMDRange_5D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -2084,7 +2113,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); @@ -2092,7 +2127,7 @@ struct TestMDRange_6D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) ); } // Test with reducers - scalar @@ -2214,7 +2249,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); parallel_for( range, functor ); @@ -2223,12 +2264,12 @@ struct TestMDRange_6D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) - for ( int n = 0; n < N5; ++n ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) + for ( int n = s5; n < N5; ++n ) { if ( h_view( i, j, k, l, m, n ) != 3 ) { ++counter; @@ -2236,7 +2277,7 @@ struct TestMDRange_6D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 );
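Taken together, the TestMDRange changes above give every rank from 2 to 6 a
non-zero starting offset and check both the reduction value, 2 * (N0-s0) *
... * (Nd-sd), and the per-element writes over [sd, Nd). For reference, a
minimal standalone sketch of the feature under test; this follows the
Kokkos 2.04.04 API used throughout this patch and is not part of the patch
itself:

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        typedef Kokkos::Experimental::MDRangePolicy<
          Kokkos::Experimental::Rank<2> > range_type;

        const int N0 = 4, N1 = 5;
        const int s0 = 1, s1 = 1;
        // Lower bounds {s0,s1}, upper bounds {N0,N1}, tile dims {3,3}.
        range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );

        long count = 0;
        Kokkos::parallel_reduce( range,
          KOKKOS_LAMBDA( const int /*i*/, const int /*j*/, long& sum ) {
            sum += 1;
          }, count );
        // With the lower-bound fix, count == (N0 - s0) * (N1 - s1) == 12;
        // before it, CUDA builds iterated from (0,0) and produced 20.
      }
      Kokkos::finalize();
      return 0;
    }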