Merge pull request #677 from stanmoore1/kk_update

Update to Kokkos r2.04.04 and add workaround for performance regression
Steve Plimpton 2017-10-05 17:10:00 -06:00 committed by GitHub
commit 439c2fd980
26 changed files with 1222 additions and 562 deletions

View File

@ -1,5 +1,24 @@
# Change Log
## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
**Implemented enhancements:**
- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857) (a generic pool-usage sketch follows this list)
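
Several entries above, together with the ROCm `Random_XorShift*` specializations later in this diff, revolve around the Kokkos random-number pool API. The sketch below shows the `get_state`/`drand`/`free_state` pattern those pools share, assuming the default execution space; it illustrates generic pool usage, not necessarily the new deterministic generator of \#857.

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Pool of per-thread XorShift64 generator states, seeded from one value.
    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultExecutionSpace> pool(12345);
    Kokkos::View<double*> x("x", 100);
    Kokkos::parallel_for("fill_random", 100, KOKKOS_LAMBDA(const int i) {
      auto gen = pool.get_state();   // lock one generator state
      x(i) = gen.drand(0.0, 1.0);    // uniform double in [0,1)
      pool.free_state(gen);          // release it for other threads
    });
  }
  Kokkos::finalize();
  return 0;
}
```
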
**Fixed bugs:**
- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041) (illustrated by the sketch after this list)
- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
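
The `DeviceIterateTile` changes near the end of this diff implement the \#1041 fix by adding `m_rp.m_lower[dim]` to every per-thread offset. A minimal sketch of the kind of policy affected, written against the current `Kokkos::MDRangePolicy` spelling (in the 2.04 series the policy still lived under `Kokkos::Experimental`); before the fix, GPU backends iterated such a range as if its lower bounds were zero.

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Iterate the 2-D index box [2,10) x [3,8); the {2,3} lower bounds
    // are exactly what the GPU backends used to ignore.
    Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({2, 3}, {10, 8});
    Kokkos::parallel_for("offset_range", policy,
      KOKKOS_LAMBDA(const int i, const int j) {
        // i runs over 2..9 and j over 3..7 once the lower bound is honored
        (void)i;
        (void)j;
      });
  }
  Kokkos::finalize();
  return 0;
}
```
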
## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)

View File

@ -443,7 +443,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_LIBS += -lmemkind
KOKKOS_LIBS += -lmemkind -lnuma
tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
endif
@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler, or it could be XL on P8.
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
else
# Assume that this is really a GNU compiler on P8.
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
endif
endif
endif
@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler, or it could be XL on P9.
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
else
# Assume that this is really a GNU compiler on P9.
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
endif
endif
endif
endif

View File

@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
}
#endif
#if defined(KOKKOS_ENABLE_ROCM)
template<>
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
private:
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
public:
typedef Kokkos::Experimental::ROCm device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
tmp = tmp>>16;
return static_cast<uint32_t>(tmp&MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND/range)*range;
uint32_t tmp = urand();
while(tmp>=max_val)
tmp = urand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end ) {
return urand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64/range)*range;
uint64_t tmp = urand64();
while(tmp>=max_val)
tmp = urand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
return urand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int rand() {
return static_cast<int>(urand()/2);
}
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND/range)*range;
int tmp = rand();
while(tmp>=max_val)
tmp = rand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end ) {
return rand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() {
return static_cast<int64_t>(urand64()/2);
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64/range)*range;
int64_t tmp = rand64();
while(tmp>=max_val)
tmp = rand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end ) {
return rand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
float frand() {
return 1.0f * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end ) {
return frand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
double drand() {
return 1.0 * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end ) {
return drand(end-start)+start;
}
//Marsaglia polar method for drawing a standard normal distributed random number
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while(S>=1.0) {
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*std::sqrt(-2.0*log(S)/S);
}
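// Explanatory note (added comment, not in the original source): the loop
// above rejects (U,V) pairs drawn uniformly from [-1,1)^2 until
// S = U*U + V*V falls inside the unit disk; for such a pair,
// U*sqrt(-2*log(S)/S) is a sample from the standard normal N(0,1).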
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0) {
return mean + normal()*std_dev;
}
};
template<>
inline
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
#else
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
return;
#endif
}
template<>
inline
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
for(int i=0; i<16; i++)
state_(state.state_idx_,i) = state.state_[i];
locks_(state.state_idx_) = 0;
return;
#endif
}
#endif

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
TARGETS += KokkosAlgorithms_UnitTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
TARGETS += KokkosAlgorithms_UnitTest_Threads
@ -51,6 +57,9 @@ endif
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosAlgorithms_UnitTest_Cuda
./KokkosAlgorithms_UnitTest_Cuda
test-rocm: KokkosAlgorithms_UnitTest_ROCm
./KokkosAlgorithms_UnitTest_ROCm
test-threads: KokkosAlgorithms_UnitTest_Threads
./KokkosAlgorithms_UnitTest_Threads

View File

@ -0,0 +1,112 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <TestRandom.hpp>
#include <TestSort.hpp>
namespace Test {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
void rocm_test_random_xorshift64( int num_draws )
{
Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}
void rocm_test_random_xorshift1024( int num_draws )
{
Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}
#define ROCM_RANDOM_XORSHIFT64( num_draws ) \
TEST_F( rocm, Random_XorShift64 ) { \
rocm_test_random_xorshift64(num_draws); \
}
#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \
TEST_F( rocm, Random_XorShift1024 ) { \
rocm_test_random_xorshift1024(num_draws); \
}
#define ROCM_SORT_UNSIGNED( size ) \
TEST_F( rocm, SortUnsigned ) { \
Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \
}
ROCM_RANDOM_XORSHIFT64( 132141141 )
ROCM_RANDOM_XORSHIFT1024( 52428813 )
ROCM_SORT_UNSIGNED(171)
#undef ROCM_RANDOM_XORSHIFT64
#undef ROCM_RANDOM_XORSHIFT1024
#undef ROCM_SORT_UNSIGNED
}
#else
void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
#endif /* #ifdef KOKKOS_ENABLE_ROCM */

View File

@ -27,7 +27,7 @@ fi
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
MY_PID="$BASHPID"
HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
fi
################################################################################
@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
declare -i HPCBIND_QUEUE_MAPPING=0
if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_NAME="sbatch"
if [[ ! -z "${PMI_RANK}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="mpich"
HPCBIND_QUEUE_INDEX=${PMI_RANK}
elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="openmpi"
HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="mvapich2"
HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="slurm"
HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="bsub"
HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
HPCBIND_QUEUE_GPU_MAPPING=1
HPCBIND_QUEUE_MAPPING=1
HPCBIND_QUEUE_NAME="aprun"
HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
################################################################################
# Show help
################################################################################
@ -91,13 +102,14 @@ function show_help {
echo " --proc-bind=<LOC> Set the initial process mask for the script"
echo " LOC can be any valid location argument for"
echo " hwloc-calc Default: all"
echo " --whole-system ${cmd} will ignore the its parent process binding"
echo " --distribute=N Distribute the current cpuset into N partitions"
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
@ -110,22 +122,30 @@ function show_help {
echo " --force-openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --show-bindings Show the bindings"
echo " --lstopo Show bindings in lstopo without executing a command"
echo " -v|--verbose Show options and relevant environment variables"
echo " --output-prefix=<P> Save the output to files of the form"
echo " P-N.log, P-N.out and P-N.err where P is the prefix"
echo " and N is the queue index or mpi rank (no spaces)"
echo " --output-mode=<Op> How console output should be handled."
echo " Options are all, rank0, and none. Default: rank0"
echo " --lstopo Show bindings in lstopo"
echo " -v|--verbose Print bindings and relevant environment variables"
echo " -h|--help Show this message"
echo ""
echo "Sample Usage:"
echo " Split the current process cpuset into 4 and use the 3rd partition"
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
echo " Bing the process to all even cores"
echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
echo " and save the output to rank specific files"
echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
echo " --distribute=4 -v --output-prefix=output -- command ..."
echo " Bind the process to all even cores"
echo " ${cmd} --proc-bind=core:even -v -- command ..."
echo " Bind to the first 64 cores and split the current process cpuset into 4"
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
echo " skip GPU 0 when mapping visible devices"
echo " Bind the the even cores of socket 0 and the odd cores of socket 1"
echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
echo " Skip GPU 0 when mapping visible devices"
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings"
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
echo " ${cmd} --proc-bind=numa:0 -- command"
echo " Display the current bindings using lstopo"
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
echo ""
@ -144,7 +164,7 @@ fi
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
declare -i HPCBIND_PARTITION=-1
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0
for i in $@; do
case $i in
HPCBIND_OUTPUT_PREFIX=""
HPCBIND_OUTPUT_MODE="rank0"
declare -i HPCBIND_HAS_COMMAND=0
for i in "$@"; do
case "$i" in
# number of partitions to create
--no-hwloc-bind)
HPCBIND_ENABLE_HWLOC_BIND=0
@ -169,6 +193,10 @@ for i in $@; do
HPCBIND_PROC_BIND="${i#*=}"
shift
;;
--whole-system)
HPCBIND_HWLOC_PARENT_CPUSET=""
shift
;;
--distribute=*)
HPCBIND_DISTRIBUTE="${i#*=}"
shift
@ -182,8 +210,8 @@ for i in $@; do
HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
shift
;;
--gpu-ignore-queue)
HPCBIND_QUEUE_GPU_MAPPING=0
--ignore-queue)
HPCBIND_QUEUE_MAPPING=0
shift
;;
--no-gpu-mapping)
@ -218,14 +246,18 @@ for i in $@; do
HPCBIND_OPENMP_NESTED="false"
shift
;;
--show-bindings)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=1
--output-prefix=*)
HPCBIND_OUTPUT_PREFIX="${i#*=}"
shift
;;
--output-mode=*)
HPCBIND_OUTPUT_MODE="${i#*=}"
#convert to lower case
HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
shift
;;
--lstopo)
HPCBIND_VERBOSE=1
HPCBIND_SHOW_BINDINGS=0
HPCBIND_LSTOPO=1
shift
;;
@ -239,6 +271,7 @@ for i in $@; do
;;
# ignore remaining arguments
--)
HPCBIND_HAS_COMMAND=1
shift
break
;;
@ -250,16 +283,41 @@ for i in $@; do
esac
done
################################################################################
# Check output mode
################################################################################
declare -i HPCBIND_TEE=0
if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
HPCBIND_TEE=0
elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
HPCBIND_TEE=1
elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
#default to rank0 printing to screen
HPCBIND_TEE=1
fi
if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
HPCBIND_LOG=/dev/null
HPCBIND_ERR=/dev/null
HPCBIND_OUT=/dev/null
else
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
> ${HPCBIND_LOG}
fi
################################################################################
# Check unknown arguments
################################################################################
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
exit 1
fi
################################################################################
# Check that visible gpus are valid
################################################################################
@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
HPCBIND_VISIBLE_GPUS[$i]=0;
fi
done
NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi
################################################################################
# Check OpenMP percent
################################################################################
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
echo "OpenMP percent < 1, setting to 1"
HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
echo "OpenMP percent > 100, setting to 100"
HPCBIND_OPENMP_PERCENT=100
fi
@ -291,15 +346,21 @@ fi
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
HPCBIND_DISTRIBUTE=1
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
echo "Invalid input for distribute-partition, changing to 0"
################################################################################
# Choose the correct partition
################################################################################
if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
HPCBIND_PARTITION=0
fi
if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
fi
################################################################################
# Find cpuset and num threads
@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
else
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
else
HPCBIND_HWLOC_CPUSET="${BINDING}"
fi
HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
else
HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
################################################################################
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
else
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
fi
fi
@ -389,22 +454,22 @@ fi
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
export HPCBIND_HWLOC_PARENT_CPUSET="all"
else
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
fi
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
fi
@ -412,43 +477,63 @@ fi
# Print verbose
################################################################################
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
MY_ENV=$(env | sort)
echo "[HPCBIND]"
echo "${MY_ENV}" | grep -E "^HPCBIND_"
echo "[CUDA]"
echo "${MY_ENV}" | grep -E "^CUDA_"
echo "[OPENMP]"
echo "${MY_ENV}" | grep -E "^OMP_"
fi
TMP_ENV=$(env | sort)
if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
echo "[HOST]" >> ${HPCBIND_LOG}
hostname -s >> ${HPCBIND_LOG}
echo "[HPCBIND]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
echo "[CUDA]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
echo "[OPENMP]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
echo "Unable to show bindings, hwloc not available."
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
echo "[BINDINGS]" >> ${HPCBIND_LOG}
hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
else
echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
fi
else
echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
hostname -s > >(tee -a ${HPCBIND_LOG})
echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
else
echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
fi
fi
################################################################################
# Run command
################################################################################
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
else
eval $@
fi
else
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
echo "[BINDINGS]"
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
# must be the last executed command so that the return value is correct
if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
# clear output files
> ${HPCBIND_ERR}
> ${HPCBIND_OUT}
if [[ ${HPCBIND_TEE} -eq 0 ]]; then
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
else
hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
fi
else
echo "Unable to show bindings, hwloc not available."
if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
else
eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
fi
fi
fi

View File

@ -1,221 +0,0 @@
#!/usr/bin/env bash
# check if hwloc commands exist
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-distrib >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ls >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-calc >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
type hwloc-ps >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"
#parse args
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False
#get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
MY_PID="$BASHPID"
CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
echo "$CURRENT_CPUSET"
fi
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Uses hwloc to divide the node into the given number of groups,"
echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
echo " selected group."
echo ""
echo " NOTE: This command assumes it has exclusive use of the node"
echo ""
echo "Options:"
echo " --proc-bind=<LOC> Set the initial process mask for the script. "
echo " LOC can be any valid location argumnet for"
echo " hwloc-calc. Defaults to the entire machine"
echo " --distribute=N Distribute the current proc-bind into N groups"
echo " --index=I Use the i'th group (zero based)"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " (default 4.0)"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " -v|--verbose"
echo " -h|--help"
echo ""
echo "Sample Usage:"
echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
echo ""
}
if [[ "$#" -eq 0 ]]; then
show_help
exit 0
fi
for i in $@; do
case $i in
# number of partitions to create
--proc-bind=*)
PROC_BIND="${i#*=}"
shift
;;
--distribute=*)
DISTRIBUTE="${i#*=}"
shift
;;
# which group to use
--index=*)
INDEX="${i#*=}"
shift
;;
--openmp=*)
OPENMP_VERSION="${i#*=}"
shift
;;
--no-openmp-proc-bind)
OPENMP_PROC_BIND=False
shift
;;
--no-openmp-nested)
OPENMP_NESTED=False
shift
;;
-v|--verbose)
VERBOSE=True
shift
;;
-h|--help)
show_help
exit 0
;;
# ignore remaining arguments
--)
shift
break
;;
# unknown option
*)
UNKNOWN_ARGS+=("$i")
shift
;;
esac
done
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
echo "Uknown options: ${UNKNOWN_ARGS[*]}"
exit 1
fi
if [[ ${DISTRIBUTE} -le 0 ]]; then
echo "Invalid input for distribute, changing distribute to 1"
DISTRIBUTE=1
fi
if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
echo "Invalid input for index, changing index to 0"
INDEX=0
fi
if [[ ${HAS_HWLOC} -ne 0 ]]; then
echo "hwloc not found, no process binding will occur"
DISTRIBUTE=1
INDEX=0
fi
if [[ ${HAS_HWLOC} -eq 0 ]]; then
if [[ "${CURRENT_CPUSET}" == "" ]]; then
BINDING=$(hwloc-calc ${PROC_BIND})
else
BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
fi
CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
CPUSET=${CPUSETS[${INDEX}]}
NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: true"
echo " proc_bind: ${PROC_BIND}"
echo " distribute: ${DISTRIBUTE}"
echo " index: ${INDEX}"
echo " parent_cpuset: ${CURRENT_CPUSET}"
echo " cpuset: ${CPUSET}"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
hwloc-bind ${CPUSET} -- $@
else
NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
if [[ "${VERBOSE}" == "True" ]]; then
echo "hwloc: false"
echo "omp_num_threads: ${NUM_THREADS}"
echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
echo "omp_nested: ${OPENMP_NESTED}"
echo "OpenMP: ${OPENMP_VERSION}"
fi
# set OMP env
if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
export OMP_PLACES="threads"
export OMP_PROC_BIND="spread"
else
export OMP_PROC_BIND="true"
unset OMP_PLACES
fi
else
unset OMP_PLACES
unset OMP_PROC_BIND
fi
if [[ "${OPENMP_NESTED}" == "True" ]]; then
export OMP_NESTED="true"
else
export OMP_NESTED="false"
fi
export OMP_NUM_THREADS="${NUM_THREADS}"
eval $@
fi

View File

@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
# Check if we have -std=c++X or --std=c++X already
stdcxx_applied=0
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
@ -130,10 +133,16 @@ do
cuda_args="$cuda_args $1 $2"
shift
;;
#Handle c++11 setting
--std=c++11|-std=c++11)
shared_args="$shared_args $1"
#Handle c++11/c++14/c++1z settings
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
if [ $stdcxx_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
else
shared_args="$shared_args $1"
stdcxx_applied=1
fi
;;
#Strip off -std=c++98 due to nvcc warnings; Tribits will pass both -std=c++11 and -std=c++98
-std=c++98|--std=c++98)
;;

View File

@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a

View File

@ -1,4 +1,4 @@
module purge
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu
#Run Trilinos CheckinTest

View File

@ -125,6 +125,123 @@ namespace Impl {
};
}
/// \class GraphRowViewConst
/// \brief View of a row of a sparse graph.
/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
///
/// This class provides a generic view of a row of a sparse graph.
/// We intended this class to view a row of a StaticCrsGraph, but
/// GraphType need not necessarily be StaticCrsGraph.
///
/// The row view is suited for computational kernels like sparse
/// matrix-vector multiply. The view is always const, as it does not
/// allow modification of the graph.
///
/// Here is an example loop over the entries in the row:
/// \code
/// typedef typename GraphRowViewConst<GraphType>::ordinal_type ordinal_type;
///
/// GraphRowViewConst<GraphType> G_i = ...;
/// const ordinal_type numEntries = G_i.length;
/// for (ordinal_type k = 0; k < numEntries; ++k) {
/// ordinal_type j = G_i.colidx (k);
/// // ... do something with the column index j ...
/// }
/// \endcode
///
/// GraphType must provide the \c data_type
/// typedef. In addition, it must make sense to use GraphRowViewConst to
/// view a row of GraphType. In particular, column
/// indices of a row must be accessible using the <tt>entries</tt>
/// resp. <tt>colidx</tt> arrays given to the constructor of this
/// class, with a constant <tt>stride</tt> between successive entries.
/// The stride is one for the compressed sparse row storage format (as
/// is used by CrsMatrix), but may be greater than one for other
/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
template<class GraphType>
struct GraphRowViewConst {
//! The type of the column indices in the row.
typedef const typename GraphType::data_type ordinal_type;
private:
//! Array of (local) column indices in the row.
ordinal_type* colidx_;
/// \brief Stride between successive entries in the row.
///
/// For compressed sparse row (CSR) storage, this is always one.
/// This might be greater than one for storage formats like ELLPACK
/// or Jagged Diagonal. Nevertheless, the stride can never be
/// greater than the number of rows or columns in the matrix. Thus,
/// \c ordinal_type is the correct type.
const ordinal_type stride_;
public:
/// \brief Constructor
///
/// \param colidx_in [in] Array of the row's column indices.
/// \param stride [in] (Constant) stride between successive entries
/// in the \c colidx_in array.
/// \param count [in] Number of entries in the row.
KOKKOS_INLINE_FUNCTION
GraphRowViewConst ( ordinal_type* const colidx_in,
const ordinal_type& stride,
const ordinal_type& count) :
colidx_ (colidx_in), stride_ (stride), length (count)
{}
/// \brief Constructor with offset into \c colidx array
///
/// \param colidx_in [in] Array of the row's column indices.
/// \param stride [in] (Constant) stride between matrix entries in
/// each of the above arrays.
/// \param count [in] Number of entries in the row.
/// \param idx [in] Start offset into \c colidx array
///
/// \tparam OffsetType The type of \c idx (see above). Must be a
/// built-in integer type. This may differ from ordinal_type.
/// For example, the matrix may have dimensions that fit in int,
/// but a number of entries that does not fit in int.
template<class OffsetType>
KOKKOS_INLINE_FUNCTION
GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
const ordinal_type& stride,
const ordinal_type& count,
const OffsetType& idx,
const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
{}
/// \brief Number of entries in the row.
///
/// This is a public const field rather than a public const method,
/// in order to avoid possible overhead of a method call if the
/// compiler is unable to inline that method call.
///
/// We assume that rows contain no duplicate entries (i.e., entries
/// with the same column index). Thus, a row may have up to
/// A.numCols() entries. This means that the correct type of
/// 'length' is ordinal_type.
const ordinal_type length;
/// \brief (Const) reference to the column index of entry i in this
/// row of the sparse matrix.
///
/// "Entry i" is not necessarily the entry with column index i, nor
/// does i necessarily correspond to the (local) row index.
KOKKOS_INLINE_FUNCTION
ordinal_type& colidx (const ordinal_type& i) const {
return colidx_[i*stride_];
}
/// \brief An alias for colidx
KOKKOS_INLINE_FUNCTION
ordinal_type& operator()(const ordinal_type& i) const {
return colidx(i);
}
};
/// \class StaticCrsGraph
/// \brief Compressed row storage array.
///
@ -218,6 +335,38 @@ public:
static_cast<size_type> (0);
}
/// \brief Return a const view of row i of the graph.
///
/// If row i does not belong to the graph, return an empty view.
///
/// The returned object \c view implements the following interface:
/// <ul>
/// <li> \c view.length is the number of entries in the row </li>
/// <li> \c view.colidx(k) returns a const reference to the
/// column index of the k-th entry in the row </li>
/// </ul>
/// k is not a column index; it just counts from 0 to
/// <tt>view.length - 1</tt>.
///
/// Users should not rely on the return type of this method. They
/// should instead assign to 'auto'. That allows compile-time
/// polymorphism for different kinds of sparse matrix formats (e.g.,
/// ELLPACK or Jagged Diagonal) that we may wish to support in the
/// future.
KOKKOS_INLINE_FUNCTION
GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
const size_type start = row_map(i);
// count is guaranteed to fit in ordinal_type, as long as no row
// has duplicate entries.
const data_type count = static_cast<data_type> (row_map(i+1) - start);
if (count == 0) {
return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
} else {
return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
}
}
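/// Illustrative usage sketch (added comment, not part of the original
/// header); \c graph, the row index \c i, and the \c data_type alias
/// are assumed to be in scope:
/// \code
/// auto row = graph.rowConst (i);
/// for (data_type k = 0; k < row.length; ++k) {
///   const data_type j = row.colidx (k); // column index of entry k
///   // ... use j ...
/// }
/// \endcode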
/** \brief Create a row partitioning into a given number of blocks
* balancing non-zeros + a fixed cost per row.
*/

View File

@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
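Every hunk above makes the same change: each per-dimension offset now adds (index_type)m_rp.m_lower[d], so the tile iteration covers the half-open range [m_lower, m_upper) instead of silently starting at zero whenever the MDRangePolicy has a nonzero lower bound. Below is a minimal host-side sketch of the same index mapping; lower, upper and tile are illustrative stand-ins for the policy members, not the real Kokkos types.

#include <cstdio>

int main() {
  // One dimension of a tiled range [lower, upper) with tile width 'tile'.
  const long lower = 4, upper = 13, tile = 3;
  const long num_tiles = (upper - lower + tile - 1) / tile;
  for (long tile_id = 0; tile_id < num_tiles; ++tile_id) {
    for (long thr_id = 0; thr_id < tile; ++thr_id) {
      // The "+ lower" term is what the hunks above add to every offset.
      const long offset = tile_id * tile + thr_id + lower;
      if (offset < upper) std::printf("%ld ", offset);  // same bounds check as the kernels
    }
  }
  std::printf("\n");  // prints 4 5 6 7 8 9 10 11 12 -- never the unshifted 0..8
  return 0;
}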

View File

@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
//__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
//__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
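The two hunks above disable __launch_bounds__ on the CUDA launch wrappers by commenting it out rather than deleting it, which reads as a deliberate performance workaround: the attribute asks nvcc to cap per-thread register usage so that minBperSM blocks of up to maxTperB threads can be resident per SM, and with some compiler versions that cap forces register spills that slow kernels down. A generic illustration of what the attribute requests follows; the kernel and the numbers are made up for the example and are not part of Kokkos.

// Illustration only: __launch_bounds__(maxThreadsPerBlock, minBlocksPerSM) limits
// register use so at least minBlocksPerSM blocks of up to maxThreadsPerBlock
// threads can be resident per multiprocessor.
__global__
__launch_bounds__(256, 4)
void saxpy_bounded(int n, float a, const float* x, float* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + y[i];
}

// The same kernel with the cap removed, as in the change above: the compiler may
// use more registers per thread, trading occupancy for fewer spills.
__global__
void saxpy_unbounded(int n, float a, const float* x, float* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + y[i];
}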

View File

@ -242,45 +242,89 @@ public:
re_ = v;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const complex<RealType>& src) {
complex<RealType>&
operator += (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src.re_;
im_ += src.im_;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator += (const volatile complex<RealType>& src) volatile {
void
operator += (const volatile complex<InputRealType>& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src.re_;
im_ += src.im_;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const RealType& src) {
complex<RealType>&
operator += (const std::complex<RealType>& src) {
re_ += src.real();
im_ += src.imag();
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator += (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator += (const volatile RealType& src) volatile {
void
operator += (const volatile InputRealType& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ += src;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator -= (const complex<RealType>& src) {
complex<RealType>&
operator -= (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ -= src.re_;
im_ -= src.im_;
return *this;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator -= (const RealType& src) {
complex<RealType>&
operator -= (const std::complex<RealType>& src) {
re_ -= src.real();
im_ -= src.imag();
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator -= (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ -= src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator *= (const complex<RealType>& src) {
complex<RealType>&
operator *= (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
const RealType realPart = re_ * src.re_ - im_ * src.im_;
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
re_ = realPart;
@ -288,8 +332,12 @@ public:
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator *= (const volatile complex<RealType>& src) volatile {
void
operator *= (const volatile complex<InputRealType>& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
const RealType realPart = re_ * src.re_ - im_ * src.im_;
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
re_ = realPart;
@ -297,20 +345,70 @@ public:
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator *= (const RealType& src) {
complex<RealType>&
operator *= (const std::complex<RealType>& src) {
const RealType realPart = re_ * src.real() - im_ * src.imag();
const RealType imagPart = re_ * src.imag() + im_ * src.real();
re_ = realPart;
im_ = imagPart;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator *= (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ *= src;
im_ *= src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
void operator *= (const volatile RealType& src) volatile {
void
operator *= (const volatile InputRealType& src) volatile {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ *= src;
im_ *= src;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator /= (const complex<RealType>& y) {
complex<RealType>&
operator /= (const complex<InputRealType>& y) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
// because y/s is NaN.
if (s == 0.0) {
this->re_ /= s;
this->im_ /= s;
}
else {
const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
*this = x_scaled * y_conj_scaled;
*this /= y_scaled_abs;
}
return *this;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>&
operator /= (const std::complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
@ -334,57 +432,95 @@ public:
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator /= (const RealType& src) {
complex<RealType>&
operator /= (const InputRealType& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
re_ /= src;
im_ /= src;
return *this;
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& src) {
return (re_ == src.re_) && (im_ == src.im_);
bool
operator == (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
}
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType src) {
return (re_ == src) && (im_ == RealType(0));
bool
operator == (const std::complex<RealType>& src) {
return (re_ == src.real()) && (im_ == src.imag());
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool
operator == (const InputRealType src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool
operator != (const complex<InputRealType>& src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
}
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& src) {
return (re_ != src.re_) || (im_ != src.im_);
bool
operator != (const std::complex<RealType>& src) {
return (re_ != src.real()) || (im_ != src.imag());
}
template<typename InputRealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType src) {
return (re_ != src) || (im_ != RealType(0));
}
bool
operator != (const InputRealType src) {
static_assert(std::is_convertible<InputRealType,RealType>::value,
"InputRealType must be convertible to RealType");
return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
}
};
//! Binary + operator for complex complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
}
//! Binary + operator for complex scalar.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const complex<RealType>& x, const RealType& y) {
return complex<RealType> (x.real () + y , x.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator + (const complex<RealType1>& x, const RealType2& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
}
//! Binary + operator for scalar complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x + y.real (), y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator + (const RealType1& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
}
//! Unary + operator for complex.
@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
}
//! Binary - operator for complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
}
//! Binary - operator for complex scalar.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const complex<RealType>& x, const RealType& y) {
return complex<RealType> (x.real () - y , x.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator - (const complex<RealType1>& x, const RealType2& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
}
//! Binary - operator for scalar complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x - y.real (), - y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator - (const RealType1& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
}
//! Unary - operator for complex.
@ -428,12 +564,12 @@ operator - (const complex<RealType>& x) {
}
//! Binary * operator for complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
}
/// \brief Binary * operator for std::complex and complex.
@ -446,33 +582,34 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
/// This function cannot be called in a CUDA device function, because
/// std::complex's methods and nonmember functions are not marked as
/// CUDA device functions.
template<class RealType>
complex<RealType>
operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
template<class RealType1, class RealType2>
inline
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
}
/// \brief Binary * operator for RealType times complex.
///
/// This function exists because the compiler doesn't know that
/// RealType and complex<RealType> commute with respect to operator*.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x * y.real (), x * y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const RealType1& x, const complex<RealType2>& y) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
}
/// \brief Binary * operator for RealType times complex.
///
/// This function exists because the compiler doesn't know that
/// RealType and complex<RealType> commute with respect to operator*.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const complex<RealType>& y, const RealType& x) {
return complex<RealType> (x * y.real (), x * y.imag ());
complex<typename std::common_type<RealType1,RealType2>::type>
operator * (const complex<RealType1>& y, const RealType2& x) {
return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
}
//! Imaginary part of a complex number.
@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
//! Binary operator / for complex and real numbers
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType1>
complex<typename std::common_type<RealType1,RealType2>::type>
operator / (const complex<RealType1>& x, const RealType2& y) {
return complex<RealType1> (real (x) / y, imag (x) / y);
return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
}
//! Binary operator / for complex.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator / (const complex<RealType>& x, const complex<RealType>& y) {
complex<typename std::common_type<RealType1,RealType2>::type>
operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
// because y/s is NaN.
if (s == 0.0) {
return complex<RealType> (real (x) / s, imag (x) / s);
return complex<common_real_type> (real (x) / s, imag (x) / s);
}
else {
const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
complex<RealType> result = x_scaled * y_conj_scaled;
complex<common_real_type> result = x_scaled * y_conj_scaled;
result /= y_scaled_abs;
return result;
}
@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
//! Binary operator / for complex and real numbers
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType1>
complex<typename std::common_type<RealType1,RealType2>::type>
operator / (const RealType1& x, const complex<RealType2>& y) {
return complex<RealType1> (x)/y;
return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
}
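Both division operators above follow the strategy spelled out in the comments: scale numerator and denominator by the "1-norm" s = |Re y| + |Im y| before forming x * conj(y) / |y|^2, so the intermediate products cannot overflow when y has very large components. A small, hedged sketch of the same computation on plain doubles, independent of the Kokkos types:

#include <cmath>
#include <cstdio>

// Scaled complex division (ar + i*ai) / (br + i*bi), mirroring the operators above.
void scaled_div(double ar, double ai, double br, double bi, double& cr, double& ci) {
  const double s = std::fabs(br) + std::fabs(bi);      // "1-norm" of the divisor
  if (s == 0.0) { cr = ar / s; ci = ai / s; return; }   // keep IEEE Inf/NaN semantics
  const double xr = ar / s, xi = ai / s;                // x / s
  const double yr = br / s, yi = bi / s;                // y / s
  const double d  = yr * yr + yi * yi;                  // |y / s|^2
  cr = (xr * yr + xi * yi) / d;
  ci = (xi * yr - xr * yi) / d;
}

int main() {
  double cr, ci;
  // A naive |y|^2 = br*br + bi*bi would overflow to Inf here; the scaled form does not.
  scaled_div(1.0, 2.0, 1e300, 1e300, cr, ci);
  std::printf("%g + %gi\n", cr, ci);  // ~1.5e-300 + 5e-301i
  return 0;
}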
//! Equality operator for two complex numbers.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) == real (y) && imag (x) == imag (y);
bool
operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) &&
static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
}
/// \brief Equality operator for std::complex and Kokkos::complex.
@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
/// Otherwise, CUDA builds will give compiler warnings ("warning:
/// calling a constexpr __host__ function("real") from a __host__
/// __device__ function("operator==") is not allowed").
template<class RealType>
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) == real (y) && std::imag (x) == imag (y);
template<class RealType1, class RealType2>
inline
bool
operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) &&
static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
}
//! Equality operator for complex and real number.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType1>& x, const RealType2& y) {
return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
bool
operator == (const complex<RealType1>& x, const RealType2& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) &&
static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
}
//! Equality operator for real and complex number.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType& x, const complex<RealType>& y) {
bool
operator == (const RealType1& x, const complex<RealType2>& y) {
return y == x;
}
//! Inequality operator for two complex numbers.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) != real (y) || imag (x) != imag (y);
bool
operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) ||
static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
}
//! Inequality operator for std::complex and Kokkos::complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) != real (y) || std::imag (x) != imag (y);
template<class RealType1, class RealType2>
inline
bool
operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) ||
static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
}
//! Inequality operator for complex and real number.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType1>& x, const RealType2& y) {
return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
bool
operator != (const complex<RealType1>& x, const RealType2& y) {
typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) ||
static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
}
//! Inequality operator for real and complex number.
template<class RealType>
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType& x, const complex<RealType>& y) {
bool
operator != (const RealType1& x, const complex<RealType2>& y) {
return y != x;
}
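With the changes above, the binary operators are templated on two real types and return complex<typename std::common_type<RealType1,RealType2>::type>, and the compound assignments accept any InputRealType that is convertible to RealType (enforced by static_assert). The net effect is ordinary arithmetic promotion for mixed-precision operands: complex<float> combined with complex<double> yields complex<double>, and an int scalar mixed with complex<float> behaves as a float. A minimal sketch of just the promotion rule on plain reals, not the Kokkos::complex implementation itself:

#include <type_traits>

// The same promotion pattern used above: a binary op over two real types
// returns their common type.
template <class R1, class R2>
typename std::common_type<R1, R2>::type add_real(R1 a, R2 b) { return a + b; }

int main() {
  static_assert(std::is_same<std::common_type<float, double>::type, double>::value,
                "complex<float> op complex<double> promotes to complex<double>");
  static_assert(std::is_same<std::common_type<int, float>::type, float>::value,
                "an int scalar mixed with complex<float> promotes through float");
  return add_real(1.5f, 2.5) == 4.0 ? 0 : 1;  // float + double -> double, exactly 4.0
}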

View File

@ -353,7 +353,14 @@ struct CountAndFill {
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
data_type* fill = &(m_crs.entries(j));
/* We don't want to access entries(entries.size()), even if it's just to take its
address and never use it.
This can happen when row (i) is empty and all rows after it are also empty.
We could compare against row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory). */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
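The new guard matches the comment: for a trailing run of empty rows, row_map(i) equals the total entry count, so even forming &entries(j) would index one past the end of the entries view; comparing j against entries.dimension_0() avoids that without the extra global-memory read of row_map(i + 1). A tiny sketch of the CRS layout that triggers the case, with plain arrays and made-up values:

#include <cstddef>
#include <cstdio>

int main() {
  // CRS with 4 rows; rows 2 and 3 are empty, so their row_map value equals the
  // total entry count (5) and &entries[row_map[i]] would point past the end.
  const std::size_t row_map[5]  = {0, 3, 5, 5, 5};
  const int         entries[5]  = {7, 1, 4, 2, 9};
  const std::size_t num_entries = 5;

  for (std::size_t i = 0; i < 4; ++i) {
    const std::size_t j = row_map[i];
    const int* fill = (j == num_entries) ? nullptr : &entries[j];  // the guard above
    std::printf("row %zu -> %s\n", i, fill ? "has entries" : "empty tail row");
  }
  return 0;
}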

View File

@ -147,12 +147,11 @@ public:
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return "HBW"; }
private:
AllocationMechanism m_alloc_mech;
static constexpr const char* m_name = "HBW";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
};

View File

@ -192,7 +192,7 @@ template<>
struct reduction_identity<float> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
};
@ -200,7 +200,7 @@ template<>
struct reduction_identity<double> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
};
@ -208,7 +208,7 @@ template<>
struct reduction_identity<long double> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;}
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
};
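The three hunks above fix the max() identity: FLT_MIN, DBL_MIN and LDBL_MIN are the smallest positive normalized values, not the most negative ones, so a max-reduction seeded with them reports the identity itself whenever every element is negative. The negated *_MAX constants are the correct identities. A minimal check:

#include <algorithm>
#include <cfloat>
#include <cstdio>

int main() {
  const double data[3] = {-5.0, -2.0, -9.0};

  double wrong = DBL_MIN;   // old identity: ~2.2e-308, still positive
  double right = -DBL_MAX;  // new identity: most negative finite double
  for (double x : data) {
    wrong = std::max(wrong, x);
    right = std::max(right, x);
  }
  // prints: wrong identity -> 2.22507e-308, correct identity -> -2
  std::printf("wrong identity -> %g, correct identity -> %g\n", wrong, right);
  return 0;
}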

View File

@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
} // namespace Kokkos
#define threadIdx_x (hc_get_workitem_id(0))
#define threadIdx_y (hc_get_workitem_id(1))
#define threadIdx_z (hc_get_workitem_id(2))
#define blockIdx_x (hc_get_group_id(0))
#define blockIdx_y (hc_get_group_id(1))
#define blockIdx_z (hc_get_group_id(2))
#define blockDim_x (hc_get_group_size(0))
#define blockDim_y (hc_get_group_size(1))
#define blockDim_z (hc_get_group_size(2))
#define gridDim_x (hc_get_num_groups(0))
#define gridDim_y (hc_get_num_groups(1))
#define gridDim_z (hc_get_num_groups(2))
#include <ROCm/Kokkos_ROCm_Parallel.hpp>
#include <ROCm/Kokkos_ROCm_Task.hpp>

View File

@ -88,6 +88,7 @@ build-makefile-kokkos:
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Variables used in application Makefiles" >> Makefile.kokkos
echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos

View File

@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
, thread_local_bytes
);
omp_set_num_threads(partition_size);
f( omp_get_thread_num(), omp_get_num_threads() );
Impl::t_openmp_instance->~Exec();
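The added omp_set_num_threads(partition_size) call means the master thread of each partition now fixes the thread count for the nested level before f runs, so parallel regions launched inside a partition actually use partition_size threads. Below is a plain-OpenMP sketch of the underlying mechanism, independent of the Kokkos partition_master API; the actual nested behavior still depends on runtime settings such as the maximum active levels.

#include <omp.h>
#include <cstdio>

int main() {
  omp_set_nested(1);               // enable nested parallelism (or OMP_MAX_ACTIVE_LEVELS >= 2)
  const int partition_size = 2;

  #pragma omp parallel num_threads(3)     // three "partitions", one master thread each
  {
    omp_set_num_threads(partition_size);  // what the added line above does per partition
    #pragma omp parallel                  // nested region now runs with partition_size threads
    {
      std::printf("partition %d: nested thread %d of %d\n",
                  omp_get_ancestor_thread_num(1),
                  omp_get_thread_num(), omp_get_num_threads());
    }
  }
  return 0;
}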

View File

@ -113,7 +113,6 @@ void reduce_enqueue(
if (output_length < 1) return;
assert(output_result != nullptr);
const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
// allocate host and device memory for the results from each team
@ -176,14 +175,17 @@ void reduce_enqueue(
}
});
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
if (output_result != nullptr)
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
fut.wait();
copy(result,result_cpu.data());
for(std::size_t i=0;i<td.num_tiles;i++)
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
if (output_result != nullptr) {
for(std::size_t i=0;i<td.num_tiles;i++)
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
}
}

View File

@ -67,7 +67,7 @@ void scan_enqueue(
hc::array<value_type> result(td.num_tiles);
hc::array<value_type> scratch(len);
tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
{
const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
@ -135,7 +135,7 @@ void scan_enqueue(
ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
copy(result_cpu.data(),result);
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]]
{
// const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
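Both capture lists above change from [&,len,td] to [&,f,len,td]: the functor f is now captured by value explicitly instead of falling under the by-reference default, presumably so the device lambda carries its own copy of the functor rather than a reference into host memory. A generic C++ sketch of why the distinction matters, with no hc/ROCm types involved:

#include <cstdio>
#include <functional>

std::function<int()> make_counter() {
  int base = 41;
  auto by_value = [base]() { return base + 1; };    // safe: the copy travels with the lambda
  // auto by_ref = [&base]() { return base + 1; };  // would dangle once 'base' goes out of scope
  return by_value;
}

int main() {
  std::printf("%d\n", make_counter()());  // prints 42
  return 0;
}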

View File

@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
return full != i ? _bit_scan_forward( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_IBM )
return full != i ? __cnttz4( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_CRAYC )
return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
#else
@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __cnttz4(i);
#elif defined( KOKKOS_COMPILER_CRAYC )
return i ? _popcnt(~i & (i-1)) : -1;
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#else
unsigned t = 1u;
int r = 0;
while ( i && ( i & t == 0 ) )
{
t = t << 1;
++r;
int offset = -1;
if ( i ) {
for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
}
return r;
return offset;
#endif
}
@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
return shift - __cntlz4(i);
#elif defined( KOKKOS_COMPILER_CRAYC )
return i ? shift - _leadz32(i) : 0 ;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#else
unsigned t = 1u << shift;
int r = 0;
while ( i && ( i & t == 0 ) )
{
t = t >> 1;
++r;
int offset = 0;
if ( i ) {
for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
}
return r;
return offset;
#endif
}
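Besides adding the KOKKOS_COMPILER_CRAYC branches, the portable fallbacks were rewritten, and for good reason: in the old loops the condition i & t == 0 parses as i & (t == 0), because == binds tighter than &, so the loop body never ran and both fallbacks returned 0 regardless of input. The new code scans bit positions directly. A standalone version of the corrected bit_scan_forward fallback, as a sketch rather than the Kokkos source:

#include <cstdio>

// Portable fallback: index of the lowest set bit, or -1 if no bit is set.
int bit_scan_forward_fallback(unsigned i) {
  int offset = -1;
  if (i) {
    for (offset = 0; (i & (1u << offset)) == 0; ++offset) {}
  }
  return offset;
}

int main() {
  std::printf("%d %d %d\n",
              bit_scan_forward_fallback(0u),   // -1
              bit_scan_forward_fallback(8u),   //  3
              bit_scan_forward_fallback(6u));  //  1
  return 0;
}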
@ -142,6 +142,8 @@ int bit_count( unsigned i )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __popcnt4(i);
#elif defined( KOKKOS_COMPILER_CRAYC )
return _popcnt(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#else

View File

@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
}
}
constexpr const char* HBWSpace::name() {
return m_name;
}
} // namespace Experimental
} // namespace Kokkos

View File

@ -114,7 +114,7 @@ struct TestComplexBasicMath {
typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
void testit () {
d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
h_results = Kokkos::create_mirror_view(d_results);
Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@ -125,6 +125,7 @@ struct TestComplexBasicMath {
std::complex<double> b(3.25,5.75);
std::complex<double> d(1.0,2.0);
double c = 9.3;
int e = 2;
std::complex<double> r;
r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag());
@ -147,6 +148,12 @@ struct TestComplexBasicMath {
r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
r = a;
/* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag());
/* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag());
/* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e);
/* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e);
}
KOKKOS_INLINE_FUNCTION
@ -190,6 +197,12 @@ struct TestComplexBasicMath {
d_results(17) = c-a;
d_results(18) = c*a;
d_results(19) = c/a;
int e = 2;
d_results(20) = a+e;
d_results(21) = a-e;
d_results(22) = a*e;
d_results(23) = a/e;
}
};

View File

@ -286,7 +286,9 @@ struct TestMDRange_2D {
// Test with reducers - scalar
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
int s0 = 1;
int s1 = 1;
range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );
TestMDRange_2D functor( N0, N1 );
@ -297,7 +299,7 @@ struct TestMDRange_2D {
parallel_reduce( range, functor, reducer_scalar );
ASSERT_EQ( sum, 2 * N0 * N1 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
}
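The updated expectation follows directly from the offset start: judging from the original 2 * N0 * N1 value, the functor contributes 2 per visited index pair, and with lower bounds (s0, s1) the policy visits (N0 - s0) * (N1 - s1) pairs rather than N0 * N1. A quick self-contained check of the arithmetic, with made-up extents:

#include <cassert>

int main() {
  const int N0 = 4, N1 = 4, s0 = 1, s1 = 1;
  int sum = 0;
  for (int i = s0; i < N0; ++i)     // offset start, as in the range above
    for (int j = s1; j < N1; ++j)
      sum += 2;                     // 2 per visited point, matching the reduction
  assert(sum == 2 * (N0 - s0) * (N1 - s1));  // 18, not 2 * N0 * N1 == 32
  return 0;
}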
// Test with reducers - scalar view
{
@ -445,7 +447,9 @@ struct TestMDRange_2D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
const int s0 = 1;
const int s1 = 1;
range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
TestMDRange_2D functor( N0, N1 );
parallel_for( range, functor );
@ -454,8 +458,8 @@ struct TestMDRange_2D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
{
if ( h_view( i, j ) != 3 ) {
++counter;
@ -463,7 +467,7 @@ struct TestMDRange_2D {
}
if ( counter != 0 ) {
printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
@ -699,6 +703,7 @@ struct TestMDRange_2D {
ASSERT_EQ( counter, 0 );
}
} // end test_for2
}; // MDRange_2D
@ -749,7 +754,10 @@ struct TestMDRange_3D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
TestMDRange_3D functor( N0, N1, N2 );
@ -757,7 +765,7 @@ struct TestMDRange_3D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
}
// Test with reducers - scalar
@ -952,7 +960,10 @@ struct TestMDRange_3D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
TestMDRange_3D functor( N0, N1, N2 );
parallel_for( range, functor );
@ -961,9 +972,9 @@ struct TestMDRange_3D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
{
if ( h_view( i, j, k ) != 3 ) {
++counter;
@ -971,7 +982,7 @@ struct TestMDRange_3D {
}
if ( counter != 0 ) {
printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
TestMDRange_4D functor( N0, N1, N2, N3 );
@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
}
// Test with reducers - scalar
@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
TestMDRange_4D functor( N0, N1, N2, N3 );
parallel_for( range, functor );
@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int l = 0; l < N3; ++l )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
for ( int l = s3; l < N3; ++l )
{
if ( h_view( i, j, k, l ) != 3 ) {
++counter;
@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
}
if ( counter != 0 ) {
printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
}
ASSERT_EQ( counter, 0 );
@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
TestMDRange_5D functor( N0, N1, N2, N3, N4 );
@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
}
// Test with reducers - scalar
@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
TestMDRange_5D functor( N0, N1, N2, N3, N4 );
parallel_for( range, functor );
@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int l = 0; l < N3; ++l )
for ( int m = 0; m < N4; ++m )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
for ( int l = s3; l < N3; ++l )
for ( int m = s4; m < N4; ++m )
{
if ( h_view( i, j, k, l, m ) != 3 ) {
++counter;
@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
}
if ( counter != 0 ) {
printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );
@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
int s5 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
}
// Test with reducers - scalar
@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
int s0 = 1;
int s1 = 1;
int s2 = 1;
int s3 = 1;
int s4 = 1;
int s5 = 1;
range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
parallel_for( range, functor );
@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
Kokkos::deep_copy( h_view, functor.input_view );
int counter = 0;
for ( int i = 0; i < N0; ++i )
for ( int j = 0; j < N1; ++j )
for ( int k = 0; k < N2; ++k )
for ( int l = 0; l < N3; ++l )
for ( int m = 0; m < N4; ++m )
for ( int n = 0; n < N5; ++n )
for ( int i = s0; i < N0; ++i )
for ( int j = s1; j < N1; ++j )
for ( int k = s2; k < N2; ++k )
for ( int l = s3; l < N3; ++l )
for ( int m = s4; m < N4; ++m )
for ( int n = s5; n < N5; ++n )
{
if ( h_view( i, j, k, l, m, n ) != 3 ) {
++counter;
@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
}
if ( counter != 0 ) {
printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
}
ASSERT_EQ( counter, 0 );