From a55adf4a6848a734a492e2f5dc993041927db08a Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 3 Oct 2017 11:30:00 -0600 Subject: [PATCH] Update to Kokkos r2.04.04 and add workaround for performance regression --- lib/kokkos/CHANGELOG.md | 19 + lib/kokkos/Makefile.kokkos | 32 +- lib/kokkos/algorithms/src/Kokkos_Random.hpp | 237 ++++++++++++ lib/kokkos/algorithms/unit_tests/Makefile | 12 + lib/kokkos/algorithms/unit_tests/TestROCm.cpp | 112 ++++++ lib/kokkos/bin/hpcbind | 239 ++++++++---- lib/kokkos/bin/kokkos-bind | 221 ----------- lib/kokkos/bin/nvcc_wrapper | 15 +- lib/kokkos/config/master_history.txt | 1 + .../config/trilinos-integration/checkin-test | 2 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 149 ++++++++ .../KokkosExp_Cuda_IterateTile_Refactor.hpp | 160 ++++---- lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp | 4 +- lib/kokkos/core/src/Kokkos_Complex.hpp | 357 +++++++++++++----- lib/kokkos/core/src/Kokkos_Crs.hpp | 9 +- lib/kokkos/core/src/Kokkos_HBWSpace.hpp | 3 +- lib/kokkos/core/src/Kokkos_NumericTraits.hpp | 6 +- lib/kokkos/core/src/Kokkos_ROCm.hpp | 18 + lib/kokkos/core/src/Makefile | 1 + .../core/src/OpenMP/Kokkos_OpenMP_Exec.hpp | 1 + .../core/src/ROCm/Kokkos_ROCm_Reduce.hpp | 12 +- lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp | 4 +- lib/kokkos/core/src/impl/Kokkos_BitOps.hpp | 30 +- lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp | 4 - lib/kokkos/core/unit_test/TestComplex.hpp | 15 +- lib/kokkos/core/unit_test/TestMDRange.hpp | 121 ++++-- 26 files changed, 1222 insertions(+), 562 deletions(-) create mode 100644 lib/kokkos/algorithms/unit_tests/TestROCm.cpp delete mode 100755 lib/kokkos/bin/kokkos-bind diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 43d3f17d63..d414056187 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,24 @@ # Change Log +## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) + +**Implemented enhancements:** + +- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082) +- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071) +- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052) +- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019) +- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952) +- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) + +**Fixed bugs:** + +- Fix reduction\_identity\::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048) +- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) +- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094) +- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070) + ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index b8236e8fd1..4641232a1f 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -443,7 +443,7 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_CPPFLAGS += 
-I$(MEMKIND_PATH)/include KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_LIBS += -lmemkind + KOKKOS_LIBS += -lmemkind -lnuma tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp ) endif @@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P8. - KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 - KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P8. + KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8 + KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8 + endif + endif endif endif @@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) else - # Assume that this is a really a GNU compiler or it could be XL on P9. - KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 - KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + else + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) + + else + # Assume that this is a really a GNU compiler on P9 + KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9 + KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9 + endif + endif endif endif diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 9082e47052..3db9a145d7 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool::free_state(const Random_XorShift102 } +#endif + +#if defined(KOKKOS_ENABLE_ROCM) + + template<> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t* state_; + const int stride_; + friend class Random_XorShift1024_Pool; + public: + + typedef Kokkos::Experimental::ROCm device_type; + typedef Random_XorShift1024_Pool pool_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){ + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[ p_ * stride_ ]; + uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + 
uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + rand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = 2.0*drand() - 1.0; + const double V = 2.0*drand() - 1.0; + S = U*U+V*V; + } + return U*std::sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool::Random_XorShift64_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64 Random_XorShift64_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64(state_(i),i); +#else + return Random_XorShift64(state_(0),0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool::free_state(const Random_XorShift64 &state) const { +#ifdef __HCC_ACCELERATOR__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool::Random_XorShift1024_Pool(uint64_t seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024 Random_XorShift1024_Pool::get_state() const { +#ifdef __HCC_ACCELERATOR__ + const int i_offset = 
(threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z; + int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) * + blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim_x*blockDim_y*blockDim_z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024(state_, p_(i), i); +#else + return Random_XorShift1024(state_, p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool::free_state(const Random_XorShift1024 &state) const { +#ifdef __HCC_ACCELERATOR__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + #endif diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index b74192ef18..a5a10c82ee 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o + TARGETS += KokkosAlgorithms_UnitTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Threads @@ -51,6 +57,9 @@ endif KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda +KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm + KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads @@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosAlgorithms_UnitTest_Cuda ./KokkosAlgorithms_UnitTest_Cuda +test-rocm: KokkosAlgorithms_UnitTest_ROCm + ./KokkosAlgorithms_UnitTest_ROCm + test-threads: KokkosAlgorithms_UnitTest_Threads ./KokkosAlgorithms_UnitTest_Threads diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp new file mode 100644 index 0000000000..720b377ed2 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp @@ -0,0 +1,112 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#ifdef KOKKOS_ENABLE_ROCM + +#include +#include +#include + +#include + +#include + +#include +#include + +namespace Test { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +void rocm_test_random_xorshift64( int num_draws ) +{ + Impl::test_random >(num_draws); +} + +void rocm_test_random_xorshift1024( int num_draws ) +{ + Impl::test_random >(num_draws); +} + + +#define ROCM_RANDOM_XORSHIFT64( num_draws ) \ + TEST_F( rocm, Random_XorShift64 ) { \ + rocm_test_random_xorshift64(num_draws); \ + } + +#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \ + TEST_F( rocm, Random_XorShift1024 ) { \ + rocm_test_random_xorshift1024(num_draws); \ + } + +#define ROCM_SORT_UNSIGNED( size ) \ + TEST_F( rocm, SortUnsigned ) { \ + Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \ + } + +ROCM_RANDOM_XORSHIFT64( 132141141 ) +ROCM_RANDOM_XORSHIFT1024( 52428813 ) +ROCM_SORT_UNSIGNED(171) + +#undef ROCM_RANDOM_XORSHIFT64 +#undef ROCM_RANDOM_XORSHIFT1024 +#undef ROCM_SORT_UNSIGNED +} +#else +void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {} +#endif /* #ifdef KOKKOS_ENABLE_ROCM */ + diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index ca34648780..b88b334f8b 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -27,7 +27,7 @@ fi HPCBIND_HWLOC_PARENT_CPUSET="" if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then MY_PID="$BASHPID" - HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) + HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" fi ################################################################################ @@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) ################################################################################ HPCBIND_QUEUE_NAME="" declare -i HPCBIND_QUEUE_INDEX=0 -declare -i HPCBIND_QUEUE_GPU_MAPPING=0 +declare -i HPCBIND_QUEUE_MAPPING=0 -if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 - HPCBIND_QUEUE_NAME="sbatch" +if [[ ! 
-z "${PMI_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mpich" + HPCBIND_QUEUE_INDEX=${PMI_RANK} +elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="openmpi" + HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK} +elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="mvapich2" + HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK} +elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="slurm" HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} elif [[ ! -z "${LBS_JOBINDEX}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="bsub" HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} elif [[ ! -z "${ALPS_APP_PE}" ]]; then - HPCBIND_QUEUE_GPU_MAPPING=1 + HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="aprun" HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} fi - ################################################################################ # Show help ################################################################################ @@ -91,13 +102,14 @@ function show_help { echo " --proc-bind= Set the initial process mask for the script" echo " LOC can be any valid location argument for" echo " hwloc-calc Default: all" + echo " --whole-system ${cmd} will ignore the its parent process binding" echo " --distribute=N Distribute the current cpuset into N partitions" echo " --distribute-partition=I" echo " Use the i'th partition (zero based)" echo " --visible-gpus= Comma separated list of gpu ids" echo " Default: CUDA_VISIBLE_DEVICES or all gpus in" echo " sequential order" - echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" + echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition" echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" @@ -110,22 +122,30 @@ function show_help { echo " --force-openmp-proc-bind=" echo " Override logic for selecting OMP_PROC_BIND" echo " --no-openmp-nested Set OMP_NESTED to false" - echo " --show-bindings Show the bindings" - echo " --lstopo Show bindings in lstopo without executing a command" - echo " -v|--verbose Show options and relevant environment variables" + echo " --output-prefix=

Save the output to files of the form" + echo " P-N.log, P-N.out and P-N.err where P is the prefix" + echo " and N is the queue index or mpi rank (no spaces)" + echo " --output-mode= How console output should be handled." + echo " Options are all, rank0, and none. Default: rank0" + echo " --lstopo Show bindings in lstopo" + echo " -v|--verbose Print bindings and relevant environment variables" echo " -h|--help Show this message" echo "" echo "Sample Usage:" echo " Split the current process cpuset into 4 and use the 3rd partition" echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." - echo " Bing the process to all even cores" + echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" + echo " and save the output to rank specific files" + echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" + echo " --distribute=4 -v --output-prefix=output -- command ..." + echo " Bind the process to all even cores" echo " ${cmd} --proc-bind=core:even -v -- command ..." - echo " Bind to the first 64 cores and split the current process cpuset into 4" - echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." - echo " skip GPU 0 when mapping visible devices" + echo " Bind the the even cores of socket 0 and the odd cores of socket 1" + echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo " Skip GPU 0 when mapping visible devices" echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." echo " Display the current bindings" - echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command" + echo " ${cmd} --proc-bind=numa:0 -- command" echo " Display the current bindings using lstopo" echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" echo "" @@ -144,7 +164,7 @@ fi declare -a UNKNOWN_ARGS=() declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC} declare -i HPCBIND_DISTRIBUTE=1 -declare -i HPCBIND_PARTITION=0 +declare -i HPCBIND_PARTITION=-1 HPCBIND_PROC_BIND="all" HPCBIND_OPENMP_VERSION=4.0 declare -i HPCBIND_OPENMP_PERCENT=100 @@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND="" HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} declare -i HPCBIND_VERBOSE=0 -declare -i HPCBIND_SHOW_BINDINGS=0 declare -i HPCBIND_LSTOPO=0 -for i in $@; do - case $i in +HPCBIND_OUTPUT_PREFIX="" +HPCBIND_OUTPUT_MODE="rank0" + +declare -i HPCBIND_HAS_COMMAND=0 + +for i in "$@"; do + case "$i" in # number of partitions to create --no-hwloc-bind) HPCBIND_ENABLE_HWLOC_BIND=0 @@ -169,6 +193,10 @@ for i in $@; do HPCBIND_PROC_BIND="${i#*=}" shift ;; + --whole-system) + HPCBIND_HWLOC_PARENT_CPUSET="" + shift + ;; --distribute=*) HPCBIND_DISTRIBUTE="${i#*=}" shift @@ -182,8 +210,8 @@ for i in $@; do HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ') shift ;; - --gpu-ignore-queue) - HPCBIND_QUEUE_GPU_MAPPING=0 + --ignore-queue) + HPCBIND_QUEUE_MAPPING=0 shift ;; --no-gpu-mapping) @@ -218,14 +246,18 @@ for i in $@; do HPCBIND_OPENMP_NESTED="false" shift ;; - --show-bindings) - HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=1 + --output-prefix=*) + HPCBIND_OUTPUT_PREFIX="${i#*=}" + shift + ;; + --output-mode=*) + HPCBIND_OUTPUT_MODE="${i#*=}" + #convert to lower case + HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}" shift ;; --lstopo) HPCBIND_VERBOSE=1 - HPCBIND_SHOW_BINDINGS=0 HPCBIND_LSTOPO=1 shift ;; @@ -239,6 +271,7 @@ for i in $@; do ;; # ignore remaining arguments --) + HPCBIND_HAS_COMMAND=1 shift break ;; @@ -250,16 +283,41 @@ for i in $@; do esac done 
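+# A worked illustration (editorial note, not part of the upstream script)
+# of how the queue/partition values parsed above feed the GPU selection
+# logic further down in this file:
+#   MY_TASK_ID = HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION
+#   GPU_ID     = MY_TASK_ID % NUM_GPUS
+# e.g. queue rank 3 with --distribute=2 and partition 1 on a 4-GPU node
+# yields MY_TASK_ID=7 and GPU_ID=3, so CUDA_VISIBLE_DEVICES becomes
+# ${HPCBIND_VISIBLE_GPUS[3]}.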
+################################################################################ +# Check output mode +################################################################################ +declare -i HPCBIND_TEE=0 + +if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then + HPCBIND_TEE=0 +elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then + HPCBIND_TEE=1 +elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then + #default to rank0 printing to screen + HPCBIND_TEE=1 +fi + + +if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then + HPCBIND_LOG=/dev/null + HPCBIND_ERR=/dev/null + HPCBIND_OUT=/dev/null +else + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out" + > ${HPCBIND_LOG} +fi + ################################################################################ # Check unknown arguments ################################################################################ if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" + echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG}) exit 1 fi - ################################################################################ # Check that visible gpus are valid ################################################################################ @@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} || ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then - echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0" + echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG}) HPCBIND_VISIBLE_GPUS[$i]=0; fi done NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} fi - ################################################################################ # Check OpenMP percent ################################################################################ if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then - echo "OpenMP percent < 1, setting to 1" HPCBIND_OPENMP_PERCENT=1 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then - echo "OpenMP percent > 100, setting to 100" HPCBIND_OPENMP_PERCENT=100 fi @@ -291,15 +346,21 @@ fi # Check distribute ################################################################################ if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" HPCBIND_DISTRIBUTE=1 fi -if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then - echo "Invalid input for distribute-partition, changing to 0" +################################################################################ +#choose the correct partition +################################################################################ +if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then + HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX} +elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then HPCBIND_PARTITION=0 fi +if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then + HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)) +fi ################################################################################ # Find cpuset and num threads @@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]}) else - 
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND}) + BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]}) fi - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) - HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]} + if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then + CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE})) + HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}" + else + HPCBIND_HWLOC_CPUSET="${BINDING}" + fi HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) else HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) @@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED} ################################################################################ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then - if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then + if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" else declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) - export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]} + export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" fi fi @@ -389,22 +454,22 @@ fi export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} -export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET} +export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then export HPCBIND_HWLOC_PARENT_CPUSET="all" else - export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET} + export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}" fi -export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND} +export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}" export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') -export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION} +export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} - export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME} - export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING} + export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" + export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} fi @@ -412,43 +477,63 @@ fi # Print verbose ################################################################################ -if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then - MY_ENV=$(env | sort) - echo "[HPCBIND]" - echo "${MY_ENV}" | grep -E "^HPCBIND_" - echo "[CUDA]" - echo "${MY_ENV}" | grep -E "^CUDA_" - echo "[OPENMP]" - echo "${MY_ENV}" | grep -E "^OMP_" -fi +TMP_ENV=$(env | sort) +if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then + echo "[HOST]" >> ${HPCBIND_LOG} + hostname -s >> ${HPCBIND_LOG} + echo "[HPCBIND]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG} + echo "[CUDA]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} + echo "[OPENMP]" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^OMP_" 
>> ${HPCBIND_LOG} -if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu -elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG} + else + echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG} + fi +else + echo "[HOST]" > >(tee -a ${HPCBIND_LOG}) + hostname -s > >(tee -a ${HPCBIND_LOG}) + echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG}) + echo "[CUDA]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) + echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + + if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG}) + else + echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG}) + fi fi ################################################################################ # Run command ################################################################################ -if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@ - else - eval $@ - fi -else - if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then - if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then - echo "[BINDINGS]" - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu - hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0 +# must be the last executed command so that the return value is correct +if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0 +elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then + # clear output files + > ${HPCBIND_ERR} + > ${HPCBIND_OUT} + if [[ ${HPCBIND_TEE} -eq 0 ]]; then + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} + eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else - echo "Unable to show bindings, hwloc not available." + if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + else + eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + fi fi fi diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind deleted file mode 100755 index b6fe07a1bd..0000000000 --- a/lib/kokkos/bin/kokkos-bind +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env bash - -# check if hwloc commands exist -declare -i HAS_HWLOC=0 -type hwloc-bind >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-distrib >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ls >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-calc >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" - -type hwloc-ps >/dev/null 2>&1 -HAS_HWLOC="${HAS_HWLOC} + $?" 
- - -#parse args -declare -a UNKNOWN_ARGS=() -declare -i DISTRIBUTE=1 -declare -i INDEX=0 -PROC_BIND="all" -CURRENT_CPUSET="" -OPENMP_VERSION=4.0 -OPENMP_PROC_BIND=True -OPENMP_NESTED=True -VERBOSE=False - -#get the current process cpuset -if [[ ${HAS_HWLOC} -eq 0 ]]; then - MY_PID="$BASHPID" - CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2) - echo "$CURRENT_CPUSET" -fi - -function show_help { - local cmd=$(basename "$0") - echo "Usage: ${cmd} -- command ..." - echo " Uses hwloc to divide the node into the given number of groups," - echo " set the appropriate OMP_NUM_THREADS and execute the command on the" - echo " selected group." - echo "" - echo " NOTE: This command assumes it has exclusive use of the node" - echo "" - echo "Options:" - echo " --proc-bind= Set the initial process mask for the script. " - echo " LOC can be any valid location argumnet for" - echo " hwloc-calc. Defaults to the entire machine" - echo " --distribute=N Distribute the current proc-bind into N groups" - echo " --index=I Use the i'th group (zero based)" - echo " --openmp=M.m Set env variables for the given OpenMP version" - echo " (default 4.0)" - echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" - echo " --no-openmp-nested Set OMP_NESTED to false" - echo " -v|--verbose" - echo " -h|--help" - echo "" - echo "Sample Usage:" - echo " ${cmd} --distribute=4 --index=2 -v -- command ..." - echo "" -} - -if [[ "$#" -eq 0 ]]; then - show_help - exit 0 -fi - - -for i in $@; do - case $i in - # number of partitions to create - --proc-bind=*) - PROC_BIND="${i#*=}" - shift - ;; - --distribute=*) - DISTRIBUTE="${i#*=}" - shift - ;; - # which group to use - --index=*) - INDEX="${i#*=}" - shift - ;; - --openmp=*) - OPENMP_VERSION="${i#*=}" - shift - ;; - --no-openmp-proc-bind) - OPENMP_PROC_BIND=False - shift - ;; - --no-openmp-nested) - OPENMP_NESTED=False - shift - ;; - -v|--verbose) - VERBOSE=True - shift - ;; - -h|--help) - show_help - exit 0 - ;; - # ignore remaining arguments - --) - shift - break - ;; - # unknown option - *) - UNKNOWN_ARGS+=("$i") - shift - ;; - esac -done - -if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then - echo "Uknown options: ${UNKNOWN_ARGS[*]}" - exit 1 -fi - -if [[ ${DISTRIBUTE} -le 0 ]]; then - echo "Invalid input for distribute, changing distribute to 1" - DISTRIBUTE=1 -fi - -if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then - echo "Invalid input for index, changing index to 0" - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -ne 0 ]]; then - echo "hwloc not found, no process binding will occur" - DISTRIBUTE=1 - INDEX=0 -fi - -if [[ ${HAS_HWLOC} -eq 0 ]]; then - - if [[ "${CURRENT_CPUSET}" == "" ]]; then - BINDING=$(hwloc-calc ${PROC_BIND}) - else - BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND}) - fi - - CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE})) - CPUSET=${CPUSETS[${INDEX}]} - NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: true" - echo " proc_bind: ${PROC_BIND}" - echo " distribute: ${DISTRIBUTE}" - echo " index: ${INDEX}" - echo " parent_cpuset: ${CURRENT_CPUSET}" - echo " cpuset: ${CPUSET}" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else 
- export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - hwloc-bind ${CPUSET} -- $@ -else - NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor) - - if [[ "${VERBOSE}" == "True" ]]; then - echo "hwloc: false" - echo "omp_num_threads: ${NUM_THREADS}" - echo "omp_proc_bind: ${OPENMP_PROC_BIND}" - echo "omp_nested: ${OPENMP_NESTED}" - echo "OpenMP: ${OPENMP_VERSION}" - fi - - # set OMP env - if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then - if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then - export OMP_PLACES="threads" - export OMP_PROC_BIND="spread" - else - export OMP_PROC_BIND="true" - unset OMP_PLACES - fi - else - unset OMP_PLACES - unset OMP_PROC_BIND - fi - if [[ "${OPENMP_NESTED}" == "True" ]]; then - export OMP_NESTED="true" - else - export OMP_NESTED="false" - fi - export OMP_NUM_THREADS="${NUM_THREADS}" - - eval $@ -fi - diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 09fa5d500a..76e33f3c66 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp} # Check if we have an optimization argument already optimization_applied=0 +# Check if we have -std=c++X or --std=c++X already +stdcxx_applied=0 + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -130,10 +133,16 @@ do cuda_args="$cuda_args $1 $2" shift ;; - #Handle c++11 setting - --std=c++11|-std=c++11) - shared_args="$shared_args $1" + #Handle c++11 + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z) + if [ $stdcxx_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting" + else + shared_args="$shared_args $1" + stdcxx_applied=1 + fi ;; + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 -std=c++98|--std=c++98) ;; diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index 96b05c02e1..6f9ca897d9 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641 tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 +tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test index 92a1b1c068..ffb565fcbb 100644 --- a/lib/kokkos/config/trilinos-integration/checkin-test +++ b/lib/kokkos/config/trilinos-integration/checkin-test @@ -1,4 +1,4 @@ module purge -module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base +module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu #Run Trilinos 
CheckinTest diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 0408472c68..996b6b5610 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -125,6 +125,123 @@ namespace Impl { }; } +/// \class GraphRowViewConst +/// \brief View of a row of a sparse graph. +/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph. +/// +/// This class provides a generic view of a row of a sparse graph. +/// We intended this class to view a row of a StaticCrsGraph, but +/// GraphType need not necessarily be CrsMatrix. +/// +/// The row view is suited for computational kernels like sparse +/// matrix-vector multiply, as well as for modifying entries in the +/// sparse matrix. The view is always const as it does not allow graph modification. +/// +/// Here is an example loop over the entries in the row: +/// \code +/// typedef typename GraphRowViewConst::ordinal_type ordinal_type; +/// +/// GraphRowView G_i = ...; +/// const ordinal_type numEntries = G_i.length; +/// for (ordinal_type k = 0; k < numEntries; ++k) { +/// ordinal_type j = G_i.colidx (k); +/// // ... do something with A_ij and j ... +/// } +/// \endcode +/// +/// GraphType must provide the \c data_type +/// typedefs. In addition, it must make sense to use GraphRowViewConst to +/// view a row of GraphType. In particular, column +/// indices of a row must be accessible using the entries +/// resp. colidx arrays given to the constructor of this +/// class, with a constant stride between successive entries. +/// The stride is one for the compressed sparse row storage format (as +/// is used by CrsMatrix), but may be greater than one for other +/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal). +template +struct GraphRowViewConst { + //! The type of the column indices in the row. + typedef const typename GraphType::data_type ordinal_type; + +private: + //! Array of (local) column indices in the row. + ordinal_type* colidx_; + /// \brief Stride between successive entries in the row. + /// + /// For compressed sparse row (CSR) storage, this is always one. + /// This might be greater than one for storage formats like ELLPACK + /// or Jagged Diagonal. Nevertheless, the stride can never be + /// greater than the number of rows or columns in the matrix. Thus, + /// \c ordinal_type is the correct type. + const ordinal_type stride_; + +public: + /// \brief Constructor + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( ordinal_type* const colidx_in, + const ordinal_type& stride, + const ordinal_type& count) : + colidx_ (colidx_in), stride_ (stride), length (count) + {} + + /// \brief Constructor with offset into \c colidx array + /// + /// \param colidx [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array + /// + /// \tparam OffsetType The type of \c idx (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. 
+ /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION + GraphRowViewConst ( const typename GraphType::entries_type& colidx_in, + const ordinal_type& stride, + const ordinal_type& count, + const OffsetType& idx, + const typename std::enable_if::value, int>::type& = 0) : + colidx_ (&colidx_in(idx)), stride_ (stride), length (count) + {} + + /// \brief Number of entries in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. + const ordinal_type length; + + /// \brief (Const) reference to the column index of entry i in this + /// row of the sparse matrix. + /// + /// "Entry i" is not necessarily the entry with column index i, nor + /// does i necessarily correspond to the (local) row index. + KOKKOS_INLINE_FUNCTION + ordinal_type& colidx (const ordinal_type& i) const { + return colidx_[i*stride_]; + } + + /// \brief An alias for colidx + KOKKOS_INLINE_FUNCTION + ordinal_type& operator()(const ordinal_type& i) const { + return colidx(i); + } +}; + + /// \class StaticCrsGraph /// \brief Compressed row storage array. /// @@ -218,6 +335,38 @@ public: static_cast (0); } + /// \brief Return a const view of row i of the graph. + /// + /// If row i does not belong to the graph, return an empty view. + /// + /// The returned object \c view implements the following interface: + ///

+  /// <ul>
+  /// <li> \c view.length is the number of entries in the row </li>
+  /// <li> \c view.colidx(k) returns a const reference to the
+  ///   column index of the k-th entry in the row </li>
+  /// </ul>
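+  ///
+  /// A brief usage sketch (illustrative only; it assumes a graph \c G
+  /// of this type and a valid row index \c i):
+  /// \code
+  /// auto view = G.rowConst (i);
+  /// for (data_type k = 0; k < view.length; ++k) {
+  ///   const data_type j = view.colidx (k);
+  ///   // ... use the column index j ...
+  /// }
+  /// \endcode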
+ /// k is not a column index; it just counts from 0 to + /// view.length - 1. + /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. That allows compile-time + /// polymorphism for different kinds of sparse matrix formats (e.g., + /// ELLPACK or Jagged Diagonal) that we may wish to support in the + /// future. + KOKKOS_INLINE_FUNCTION + GraphRowViewConst rowConst (const data_type i) const { + const size_type start = row_map(i); + // count is guaranteed to fit in ordinal_type, as long as no row + // has duplicate entries. + const data_type count = static_cast (row_map(i+1) - start); + + if (count == 0) { + return GraphRowViewConst (NULL, 1, 0); + } else { + return GraphRowViewConst (entries, 1, count, start); + } + } + /** \brief Create a row partitioning into a given number of blocks * balancing non-zeros + a fixed cost per row. */ diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp index 46321378d9..c184c14d07 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1); } @@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(offset_0 , offset_1); } @@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> if (RP::inner_direction == RP::Left) { // Loop over size maxnumblocks until full range covered for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( 
index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1); } @@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { m_func(Tag(), offset_0 , offset_1); } @@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void > // LR else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type 
offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag> } else { for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + 
(index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void > const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0; 
i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); } @@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag> const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); } @@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ 
-613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void > const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag> const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3]; for ( 
index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void > const index_type 
thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + 
(index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag> const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index cae8ecd489..079d9f0889 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory() template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver ) template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -__launch_bounds__(maxTperB, minBperSM) +//__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 26b47a8b74..f8355f0d06 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -242,45 +242,89 @@ public: re_ = v; } + template KOKKOS_INLINE_FUNCTION - complex& operator += (const complex& src) { + complex& + operator += (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; return *this; } + template 
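/* Two notes on the hunks above. (1) The MDRange fix for issue #1041: each
   per-dimension offset now adds the policy's lower bound, e.g. for dimension 0

     offset_0 = tile_id0 * m_tile[0] + threadIdx.x + m_lower[0];

   so with m_lower[0] = 2, m_tile[0] = 3, tile_id0 = 1, threadIdx.x = 2 the
   iterate visits index 1*3 + 2 + 2 = 7; previously the m_lower term was
   dropped on GPUs and iteration silently started from 0. (2) Commenting out
   __launch_bounds__(maxTperB, minBperSM) in Kokkos_CudaExec.hpp is presumably
   the workaround named in the commit title: the bound caps the registers nvcc
   may assign per thread, and removing it lets the compiler choose freely, at
   the cost of possibly lower occupancy. */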
KOKKOS_INLINE_FUNCTION - void operator += (const volatile complex& src) volatile { + void + operator += (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src.re_; im_ += src.im_; } KOKKOS_INLINE_FUNCTION - complex& operator += (const RealType& src) { + complex& + operator += (const std::complex& src) { + re_ += src.real(); + im_ += src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator += (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator += (const volatile RealType& src) volatile { + void + operator += (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ += src; } - + + template KOKKOS_INLINE_FUNCTION - complex& operator -= (const complex& src) { + complex& + operator -= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src.re_; im_ -= src.im_; return *this; } KOKKOS_INLINE_FUNCTION - complex& operator -= (const RealType& src) { + complex& + operator -= (const std::complex& src) { + re_ -= src.real(); + im_ -= src.imag(); + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator -= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ -= src; return *this; } + template KOKKOS_INLINE_FUNCTION - complex& operator *= (const complex& src) { + complex& + operator *= (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -288,8 +332,12 @@ public: return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile complex& src) volatile { + void + operator *= (const volatile complex& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); const RealType realPart = re_ * src.re_ - im_ * src.im_; const RealType imagPart = re_ * src.im_ + im_ * src.re_; re_ = realPart; @@ -297,20 +345,70 @@ public: } KOKKOS_INLINE_FUNCTION - complex& operator *= (const RealType& src) { + complex& + operator *= (const std::complex& src) { + const RealType realPart = re_ * src.real() - im_ * src.imag(); + const RealType imagPart = re_ * src.imag() + im_ * src.real(); + re_ = realPart; + im_ = imagPart; + return *this; + } + + template + KOKKOS_INLINE_FUNCTION + complex& + operator *= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; return *this; } + template KOKKOS_INLINE_FUNCTION - void operator *= (const volatile RealType& src) volatile { + void + operator *= (const volatile InputRealType& src) volatile { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); re_ *= src; im_ *= src; } + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const complex& y) { + complex& + operator /= (const complex& y) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. 
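/* A minimal usage sketch of the templated compound assignments added in this
   hunk, assuming the template headers read template <class InputRealType> and
   the static_asserts check std::is_convertible<InputRealType, RealType>::value,
   as their error messages indicate:

     Kokkos::complex<double> z(1.0, 2.0);
     z += 3.0f;                                // real scalar of another type
     z *= Kokkos::complex<float>(2.0f, 0.0f);  // mixed-precision complex
     z -= std::complex<double>(0.5, 0.5);      // std::complex overload (host code)

   A type that is not convertible to RealType now fails at compile time in the
   static_assert instead of deep inside the arithmetic. */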
+ // If the real part is +/-Inf and the imaginary part is -/+Inf, + // this won't change the result. + const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ()); + + // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. + // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, + // because y/s is NaN. + if (s == 0.0) { + this->re_ /= s; + this->im_ /= s; + } + else { + const complex x_scaled (this->re_ / s, this->im_ / s); + const complex y_conj_scaled (y.re_ / s, -(y.im_) / s); + const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ + + y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y)) + *this = x_scaled * y_conj_scaled; + *this /= y_scaled_abs; + } + return *this; + } + + KOKKOS_INLINE_FUNCTION + complex& + operator /= (const std::complex& y) { + // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. @@ -334,57 +432,95 @@ public: return *this; } + + template KOKKOS_INLINE_FUNCTION - complex& operator /= (const RealType& src) { + complex& + operator /= (const InputRealType& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + re_ /= src; im_ /= src; return *this; } + template KOKKOS_INLINE_FUNCTION - bool operator == (const complex& src) { - return (re_ == src.re_) && (im_ == src.im_); + bool + operator == (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src.re_)) && (im_ == static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator == (const RealType src) { - return (re_ == src) && (im_ == RealType(0)); + bool + operator == (const std::complex& src) { + return (re_ == src.real()) && (im_ == src.imag()); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator == (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ == static_cast(src)) && (im_ == RealType(0)); + } + + template + KOKKOS_INLINE_FUNCTION + bool + operator != (const complex& src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + + return (re_ != static_cast(src.re_)) || (im_ != static_cast(src.im_)); } KOKKOS_INLINE_FUNCTION - bool operator != (const complex& src) { - return (re_ != src.re_) || (im_ != src.im_); + bool + operator != (const std::complex& src) { + return (re_ != src.real()) || (im_ != src.imag()); } + template KOKKOS_INLINE_FUNCTION - bool operator != (const RealType src) { - return (re_ != src) || (im_ != RealType(0)); - } + bool + operator != (const InputRealType src) { + static_assert(std::is_convertible::value, + "InputRealType must be convertible to RealType"); + return (re_ != static_cast(src)) || (im_ != RealType(0)); + } + }; //! Binary + operator for complex complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const complex& y) { - return complex (x.real () + y.real (), x.imag () + y.imag ()); +complex::type> +operator + (const complex& x, const complex& y) { + return complex::type > (x.real () + y.real (), x.imag () + y.imag ()); } //! Binary + operator for complex scalar. 
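/* The nonmember binary operators in this hunk take two real-type template
   parameters and deduce the result through std::common_type, assuming the
   return type reads complex<typename std::common_type<RealType1,
   RealType2>::type> as the surrounding casts suggest. A small promotion
   sketch with hypothetical values:

     Kokkos::complex<float> cf(1.0f, 2.0f);
     auto p = cf + 3.0;                              // double scalar -> Kokkos::complex<double>
     auto q = cf + Kokkos::complex<double>(1.0, 1.0); // -> Kokkos::complex<double>
 */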
-template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const complex& x, const RealType& y) { - return complex (x.real () + y , x.imag ()); +complex::type> +operator + (const complex& x, const RealType2& y) { + return complex::type> (x.real () + y , x.imag ()); } //! Binary + operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator + (const RealType& x, const complex& y) { - return complex (x + y.real (), y.imag ()); +complex::type> +operator + (const RealType1& x, const complex& y) { + return complex::type> (x + y.real (), y.imag ()); } //! Unary + operator for complex. @@ -396,27 +532,27 @@ operator + (const complex& x) { } //! Binary - operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const complex& y) { - return complex (x.real () - y.real (), x.imag () - y.imag ()); +complex::type> +operator - (const complex& x, const complex& y) { + return complex::type> (x.real () - y.real (), x.imag () - y.imag ()); } //! Binary - operator for complex scalar. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const complex& x, const RealType& y) { - return complex (x.real () - y , x.imag ()); +complex::type> +operator - (const complex& x, const RealType2& y) { + return complex::type> (x.real () - y , x.imag ()); } //! Binary - operator for scalar complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator - (const RealType& x, const complex& y) { - return complex (x - y.real (), - y.imag ()); +complex::type> +operator - (const RealType1& x, const complex& y) { + return complex::type> (x - y.real (), - y.imag ()); } //! Unary - operator for complex. @@ -428,12 +564,12 @@ operator - (const complex& x) { } //! Binary * operator for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +complex::type> +operator * (const complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for std::complex and complex. @@ -446,33 +582,34 @@ operator * (const complex& x, const complex& y) { /// This function cannot be called in a CUDA device function, because /// std::complex's methods and nonmember functions are not marked as /// CUDA device functions. -template -complex -operator * (const std::complex& x, const complex& y) { - return complex (x.real () * y.real () - x.imag () * y.imag (), - x.real () * y.imag () + x.imag () * y.real ()); +template +inline +complex::type> +operator * (const std::complex& x, const complex& y) { + return complex::type> (x.real () * y.real () - x.imag () * y.imag (), + x.real () * y.imag () + x.imag () * y.real ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. -template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const RealType& x, const complex& y) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const RealType1& x, const complex& y) { + return complex::type> (x * y.real (), x * y.imag ()); } /// \brief Binary * operator for RealType times complex. /// /// This function exists because the compiler doesn't know that /// RealType and complex commute with respect to operator*. 
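/* A worked example of the overflow-guarded division used by operator/= above
   and the nonmember operator/ below: both first scale by the 1-norm
   s = |Re y| + |Im y|, then compute x/y as (x/s) * conj(y/s) / |y/s|^2.
   With x = 1+0i and y = 2+2i: s = 4, x/s = 0.25, conj(y/s) = 0.5-0.5i,
   |y/s|^2 = 0.5, so x/y = (0.125-0.125i)/0.5 = 0.25-0.25i, matching direct
   division while keeping every intermediate magnitude near 1. */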
-template +template KOKKOS_INLINE_FUNCTION -complex -operator * (const complex& y, const RealType& x) { - return complex (x * y.real (), x * y.imag ()); +complex::type> +operator * (const complex& y, const RealType2& x) { + return complex::type> (x * y.real (), x * y.imag ()); } //! Imaginary part of a complex number. @@ -539,33 +676,34 @@ complex pow (const complex& x) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const complex& x, const RealType2& y) { - return complex (real (x) / y, imag (x) / y); + return complex::type> (real (x) / y, imag (x) / y); } //! Binary operator / for complex. -template +template KOKKOS_INLINE_FUNCTION -complex -operator / (const complex& x, const complex& y) { +complex::type> +operator / (const complex& x, const complex& y) { // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs (real (y)) + std::fabs (imag (y)); + typedef typename std::common_type::type common_real_type; + const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, // because y/s is NaN. if (s == 0.0) { - return complex (real (x) / s, imag (x) / s); + return complex (real (x) / s, imag (x) / s); } else { - const complex x_scaled (real (x) / s, imag (x) / s); - const complex y_conj_scaled (real (y) / s, -imag (y) / s); - const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + + const complex x_scaled (real (x) / s, imag (x) / s); + const complex y_conj_scaled (real (y) / s, -imag (y) / s); + const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) + imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y)) - complex result = x_scaled * y_conj_scaled; + complex result = x_scaled * y_conj_scaled; result /= y_scaled_abs; return result; } @@ -574,16 +712,19 @@ operator / (const complex& x, const complex& y) { //! Binary operator / for complex and real numbers template KOKKOS_INLINE_FUNCTION -complex +complex::type> operator / (const RealType1& x, const complex& y) { - return complex (x)/y; + return complex::type> (x)/y; } //! Equality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const complex& y) { - return real (x) == real (y) && imag (x) == imag (y); +bool +operator == (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(real (y)) && + static_cast(imag (x)) == static_cast(imag (y)) ); } /// \brief Equality operator for std::complex and Kokkos::complex. @@ -592,50 +733,68 @@ bool operator == (const complex& x, const complex& y) { /// Otherwise, CUDA builds will give compiler warnings ("warning: /// calling a constexpr __host__ function("real") from a __host__ /// __device__ function("operator==") is not allowed"). -template -bool operator == (const std::complex& x, const complex& y) { - return std::real (x) == real (y) && std::imag (x) == imag (y); +template +inline +bool +operator == (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) == static_cast(real (y)) && + static_cast(std::imag (x)) == static_cast(imag (y)) ); } - + //! 
Equality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator == (const complex& x, const RealType2& y) { - return real (x) == y && imag (x) == static_cast (0.0); +bool +operator == (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) == static_cast(y) && + static_cast(imag (x)) == static_cast(0.0) ); } //! Equality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator == (const RealType& x, const complex& y) { +bool +operator == (const RealType1& x, const complex& y) { return y == x; } //! Inequality operator for two complex numbers. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const complex& y) { - return real (x) != real (y) || imag (x) != imag (y); +bool +operator != (const complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(real (y)) || + static_cast(imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for std::complex and Kokkos::complex. -template -KOKKOS_INLINE_FUNCTION -bool operator != (const std::complex& x, const complex& y) { - return std::real (x) != real (y) || std::imag (x) != imag (y); +template +inline +bool +operator != (const std::complex& x, const complex& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(std::real (x)) != static_cast(real (y)) || + static_cast(std::imag (x)) != static_cast(imag (y)) ); } //! Inequality operator for complex and real number. template KOKKOS_INLINE_FUNCTION -bool operator != (const complex& x, const RealType2& y) { - return real (x) != y || imag (x) != static_cast (0.0); +bool +operator != (const complex& x, const RealType2& y) { + typedef typename std::common_type::type common_real_type; + return ( static_cast(real (x)) != static_cast(y) || + static_cast(imag (x)) != static_cast(0.0) ); } //! Inequality operator for real and complex number. -template +template KOKKOS_INLINE_FUNCTION -bool operator != (const RealType& x, const complex& y) { +bool +operator != (const RealType1& x, const complex& y) { return y != x; } diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index f089c16ad2..b9c131cd7a 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -353,7 +353,14 @@ struct CountAndFill { struct Fill {}; KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); - data_type* fill = &(m_crs.entries(j)); + /* we don't want to access entries(entries.size()), even if its just to get its + address and never use it. + this can happen when row (i) is empty and all rows after it are also empty. + we could compare to row_map(i + 1), but that is a read from global memory, + whereas dimension_0() should be part of the View in registers (or constant memory) */ + data_type* fill = + (j == static_cast(m_crs.entries.dimension_0())) ? 
+ nullptr : (&(m_crs.entries(j)));
     m_functor(i, fill);
   }
   using self_type = CountAndFill;
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index 9c9af0dd8b..b811751a2c 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -147,12 +147,11 @@ public:
     , const size_t arg_alloc_size ) const;
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return "HBW"; }
 private:
   AllocationMechanism  m_alloc_mech;
-  static constexpr const char* m_name = "HBW";
   friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
index 339571941d..a825fd54d3 100644
--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -192,7 +192,7 @@ template<> struct reduction_identity<float> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  {return static_cast<float>(0.0f);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return FLT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return -FLT_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  {return FLT_MAX;}
 };
@@ -200,7 +200,7 @@ template<> struct reduction_identity<double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  {return static_cast<double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return DBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return -DBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  {return DBL_MAX;}
 };
@@ -208,7 +208,7 @@ template<> struct reduction_identity<long double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  {return static_cast<long double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return LDBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return -LDBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  {return LDBL_MAX;}
 };
diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp
index b13b0b01de..0118d4667e 100644
--- a/lib/kokkos/core/src/Kokkos_ROCm.hpp
+++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp
@@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
 } // namespace Kokkos
+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x (hc_get_group_id(0))
+#define blockIdx_y (hc_get_group_id(1))
+#define blockIdx_z (hc_get_group_id(2))
+
+#define blockDim_x (hc_get_group_size(0))
+#define blockDim_y (hc_get_group_size(1))
+#define blockDim_z (hc_get_group_size(2))
+
+#define gridDim_x (hc_get_num_groups(0))
+#define gridDim_y (hc_get_num_groups(1))
+#define gridDim_z (hc_get_num_groups(2))
+
+
 #include
 #include
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
index 8fb13b8954..a917cf1656 100644
--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@@ -88,6 +88,7 @@ build-makefile-kokkos:
 	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
 	echo "#Variables used in application Makefiles" 
>> Makefile.kokkos + echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp index 37d2ac8318..de84f6e59f 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp @@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f , thread_local_bytes ); + omp_set_num_threads(partition_size); f( omp_get_thread_num(), omp_get_num_threads() ); Impl::t_openmp_instance->~Exec(); diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp index 0b7a1e2583..f2674e5929 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp @@ -113,7 +113,6 @@ void reduce_enqueue( if (output_length < 1) return; - assert(output_result != nullptr); const auto td = get_tile_desc(szElements,output_length,team_size,vector_size, shared_size); // allocate host and device memory for the results from each team @@ -176,14 +175,17 @@ void reduce_enqueue( } }); - ValueInit::init(ReducerConditional::select(f, reducer), output_result); + if (output_result != nullptr) + ValueInit::init(ReducerConditional::select(f, reducer), output_result); fut.wait(); copy(result,result_cpu.data()); - for(std::size_t i=0;i result(td.num_tiles); hc::array scratch(len); - tile_for(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] + tile_for(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]] { const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; @@ -135,7 +135,7 @@ void scan_enqueue( ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]); copy(result_cpu.data(),result); - hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] + hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] { // const auto local = t_idx.local[0]; const auto global = t_idx.global[0]; diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp index 3d3029535e..c5e73c8b26 100644 --- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept return full != i ? _bit_scan_forward( ~i ) : -1 ; #elif defined( KOKKOS_COMPILER_IBM ) return full != i ? __cnttz4( ~i ) : -1 ; +#elif defined( KOKKOS_COMPILER_CRAYC ) + return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ; #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ ) return full != i ? __builtin_ffs( ~i ) - 1 : -1 ; #else @@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i ) return _bit_scan_forward(i); #elif defined( KOKKOS_COMPILER_IBM ) return __cnttz4(i); +#elif defined( KOKKOS_COMPILER_CRAYC ) + return i ? 
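/* For the KOKKOS_COMPILER_CRAYC fallback that this expression continues with:
   ~i & (i-1) keeps exactly the bits below the lowest set bit of i, so _popcnt
   of it is that bit's index. Worked example: i = 0b10100 -> i-1 = 0b10011,
   ~i & (i-1) = 0b00011, _popcnt = 2, and bit 2 is indeed the lowest set bit
   of i. bit_first_zero above plays the same trick with
   _popcnt( i ^ (i+1) ) - 1. */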
_popcnt(~i & (i-1)) : -1;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_ffs(i) - 1;
 #else
-  unsigned t = 1u;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t << 1;
-    ++r;
+  int offset = -1;
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
   }
-  return r;
+  return offset;
 #endif
 }
@@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
   return _bit_scan_reverse(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return shift - __cntlz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? shift - _leadz32(i) : 0 ;
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
 #else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t >> 1;
-    ++r;
+  int offset = 0;
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
   }
-  return r;
+  return offset;
 #endif
 }
@@ -142,6 +142,8 @@ int bit_count( unsigned i )
   return _popcnt32(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __popcnt4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return _popcnt(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_popcount(i);
 #else
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index e11f8b6d34..cd0553218d 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
   }
 }
-constexpr const char* HBWSpace::name() {
-  return m_name;
-}
-
 } // namespace Experimental
 } // namespace Kokkos
diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp
index ce5537fed3..c7f681699e 100644
--- a/lib/kokkos/core/unit_test/TestComplex.hpp
+++ b/lib/kokkos/core/unit_test/TestComplex.hpp
@@ -114,7 +114,7 @@ struct TestComplexBasicMath {
   typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
   void testit () {
-    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
     h_results = Kokkos::create_mirror_view(d_results);
     Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@@ -125,6 +125,7 @@
     std::complex<double> b(3.25,5.75);
     std::complex<double> d(1.0,2.0);
     double c = 9.3;
+    int e = 2;
     std::complex<double> r;
     r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag());
@@ -147,6 +148,12 @@
     r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
     r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
     r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a;
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e);
   }
   KOKKOS_INLINE_FUNCTION
@@ -190,6 +197,12 @@ struct TestComplexBasicMath {
     d_results(17) = c-a;
     d_results(18) = c*a;
     d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = 
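/* The stores for d_results(20..23) exercise the new mixed complex/int
   operators on the device: taking a = (1.5, 2.5) for illustration and e = 2,
   the expected values are a+e = (3.5, 2.5), a-e = (-0.5, 2.5),
   a*e = (3.0, 5.0) and a/e = (0.75, 1.25); the int operand is promoted to
   double through the common_type machinery added in Kokkos_Complex.hpp. */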
a-e; + d_results(22) = a*e; + d_results(23) = a/e; } }; diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index f579ddf02c..fbc3a65c2f 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -286,7 +286,9 @@ struct TestMDRange_2D { // Test with reducers - scalar { typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; - range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} ); + int s0 = 1; + int s1 = 1; + range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} ); TestMDRange_2D functor( N0, N1 ); @@ -297,7 +299,7 @@ struct TestMDRange_2D { parallel_reduce( range, functor, reducer_scalar ); - ASSERT_EQ( sum, 2 * N0 * N1 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) ); } // Test with reducers - scalar view { @@ -445,7 +447,9 @@ struct TestMDRange_2D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); + const int s0 = 1; + const int s1 = 1; + range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } ); TestMDRange_2D functor( N0, N1 ); parallel_for( range, functor ); @@ -454,8 +458,8 @@ struct TestMDRange_2D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) { if ( h_view( i, j ) != 3 ) { ++counter; @@ -463,7 +467,7 @@ struct TestMDRange_2D { } if ( counter != 0 ) { - printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); + printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -699,6 +703,7 @@ struct TestMDRange_2D { ASSERT_EQ( counter, 0 ); } + } // end test_for2 }; // MDRange_2D @@ -749,7 +754,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); @@ -757,7 +765,7 @@ struct TestMDRange_3D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) ); } // Test with reducers - scalar @@ -952,7 +960,10 @@ struct TestMDRange_3D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } ); TestMDRange_3D functor( N0, N1, N2 ); parallel_for( range, functor ); @@ -961,9 +972,9 @@ struct TestMDRange_3D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) { if ( h_view( i, j, k ) != 3 ) { ++counter; @@ -971,7 +982,7 @@ struct TestMDRange_3D 
{ } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -1207,7 +1218,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); @@ -1215,7 +1230,7 @@ struct TestMDRange_4D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) ); } // Test with reducers - scalar @@ -1415,7 +1430,11 @@ struct TestMDRange_4D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } ); TestMDRange_4D functor( N0, N1, N2, N3 ); parallel_for( range, functor ); @@ -1424,10 +1443,10 @@ struct TestMDRange_4D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) { if ( h_view( i, j, k, l ) != 3 ) { ++counter; @@ -1435,7 +1454,7 @@ struct TestMDRange_4D { } if ( counter != 0 ) { - printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); + printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter); } ASSERT_EQ( counter, 0 ); @@ -1682,7 +1701,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } ); TestMDRange_5D functor( N0, N1, N2, N3, N4 ); @@ -1690,7 +1714,7 @@ struct TestMDRange_5D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) ); } // Test with reducers - scalar @@ -1810,7 +1834,12 @@ struct TestMDRange_5D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } ); TestMDRange_5D functor( N0, N1, N2, N3, 
N4 ); parallel_for( range, functor ); @@ -1819,11 +1848,11 @@ struct TestMDRange_5D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) { if ( h_view( i, j, k, l, m ) != 3 ) { ++counter; @@ -1831,7 +1860,7 @@ struct TestMDRange_5D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 ); @@ -2084,7 +2113,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } ); TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); @@ -2092,7 +2127,7 @@ struct TestMDRange_6D { double sum = 0.0; parallel_reduce( range, functor, sum ); - ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 ); + ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) ); } // Test with reducers - scalar @@ -2214,7 +2249,13 @@ struct TestMDRange_6D { typedef typename range_type::tile_type tile_type; typedef typename range_type::point_type point_type; - range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging + int s0 = 1; + int s1 = 1; + int s2 = 1; + int s3 = 1; + int s4 = 1; + int s5 = 1; + range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); parallel_for( range, functor ); @@ -2223,12 +2264,12 @@ struct TestMDRange_6D { Kokkos::deep_copy( h_view, functor.input_view ); int counter = 0; - for ( int i = 0; i < N0; ++i ) - for ( int j = 0; j < N1; ++j ) - for ( int k = 0; k < N2; ++k ) - for ( int l = 0; l < N3; ++l ) - for ( int m = 0; m < N4; ++m ) - for ( int n = 0; n < N5; ++n ) + for ( int i = s0; i < N0; ++i ) + for ( int j = s1; j < N1; ++j ) + for ( int k = s2; k < N2; ++k ) + for ( int l = s3; l < N3; ++l ) + for ( int m = s4; m < N4; ++m ) + for ( int n = s5; n < N5; ++n ) { if ( h_view( i, j, k, l, m, n ) != 3 ) { ++counter; @@ -2236,7 +2277,7 @@ struct TestMDRange_6D { } if ( counter != 0 ) { - printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); + printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter ); } ASSERT_EQ( counter, 0 );
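Taken together, the TestMDRange changes above give every rank from 2 to 6 a
non-zero starting offset and check both the reduction value, 2 * (N0-s0) *
... * (Nd-sd), and the per-element writes over [sd, Nd). For reference, a
minimal standalone sketch of the feature under test; this follows the
Kokkos 2.04.04 API used throughout this patch and is not part of the patch
itself:

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        typedef Kokkos::Experimental::MDRangePolicy<
          Kokkos::Experimental::Rank<2> > range_type;

        const int N0 = 4, N1 = 5;
        const int s0 = 1, s1 = 1;
        // Lower bounds {s0,s1}, upper bounds {N0,N1}, tile dims {3,3}.
        range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );

        long count = 0;
        Kokkos::parallel_reduce( range,
          KOKKOS_LAMBDA( const int /*i*/, const int /*j*/, long& sum ) {
            sum += 1;
          }, count );
        // With the lower-bound fix, count == (N0 - s0) * (N1 - s1) == 12;
        // before it, CUDA builds iterated from (0,0) and produced 20.
      }
      Kokkos::finalize();
      return 0;
    }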