forked from lijiext/lammps
Merge pull request #677 from stanmoore1/kk_update
Update to Kokkos r2.04.04 and add workaround for performance regression
This commit is contained in:
commit 439c2fd980
@@ -1,5 +1,24 @@
# Change Log

## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)

**Implemented enhancements:**

- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)

**Fixed bugs:**

- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)

## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
@@ -443,7 +443,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
  KOKKOS_LIBS += -lmemkind
  KOKKOS_LIBS += -lmemkind -lnuma
  tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
endif
@@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

else
  # Assume that this is really a GNU compiler or it could be XL on P8.
  KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
  KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
  ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
    KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
  else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)

    else
      # Assume that this is really a GNU compiler on P8.
      KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
      KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
    endif
  endif
endif
endif
@@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

else
  # Assume that this is really a GNU compiler or it could be XL on P9.
  KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
  KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
  ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
    KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
  else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)

    else
      # Assume that this is really a GNU compiler on P9.
      KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
      KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
    endif
  endif
endif
endif
@@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
}

#endif

#if defined(KOKKOS_ENABLE_ROCM)

template<>
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
private:
  int p_;
  const int state_idx_;
  uint64_t* state_;
  const int stride_;
  friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
public:

  typedef Kokkos::Experimental::ROCm device_type;
  typedef Random_XorShift1024_Pool<device_type> pool_type;

  enum {MAX_URAND = 0xffffffffU};
  enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
  enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
  enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};

  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
    p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
  }

  KOKKOS_INLINE_FUNCTION
  uint32_t urand() {
    uint64_t state_0 = state_[ p_ * stride_ ];
    uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
    state_1 ^= state_1 << 31;
    state_1 ^= state_1 >> 11;
    state_0 ^= state_0 >> 30;
    uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
    tmp = tmp>>16;
    return static_cast<uint32_t>(tmp&MAX_URAND);
  }

  KOKKOS_INLINE_FUNCTION
  uint64_t urand64() {
    uint64_t state_0 = state_[ p_ * stride_ ];
    uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
    state_1 ^= state_1 << 31;
    state_1 ^= state_1 >> 11;
    state_0 ^= state_0 >> 30;
    return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
  }

  KOKKOS_INLINE_FUNCTION
  uint32_t urand(const uint32_t& range) {
    const uint32_t max_val = (MAX_URAND/range)*range;
    uint32_t tmp = urand();
    while(tmp>=max_val)
      tmp = urand();  // redraw until below the largest multiple of range
    return tmp%range;
  }

  KOKKOS_INLINE_FUNCTION
  uint32_t urand(const uint32_t& start, const uint32_t& end ) {
    return urand(end-start)+start;
  }

  KOKKOS_INLINE_FUNCTION
  uint64_t urand64(const uint64_t& range) {
    const uint64_t max_val = (MAX_URAND64/range)*range;
    uint64_t tmp = urand64();
    while(tmp>=max_val)
      tmp = urand64();
    return tmp%range;
  }

  KOKKOS_INLINE_FUNCTION
  uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
    return urand64(end-start)+start;
  }

  KOKKOS_INLINE_FUNCTION
  int rand() {
    return static_cast<int>(urand()/2);
  }

  KOKKOS_INLINE_FUNCTION
  int rand(const int& range) {
    const int max_val = (MAX_RAND/range)*range;
    int tmp = rand();
    while(tmp>=max_val)
      tmp = rand();
    return tmp%range;
  }

  KOKKOS_INLINE_FUNCTION
  int rand(const int& start, const int& end ) {
    return rand(end-start)+start;
  }

  KOKKOS_INLINE_FUNCTION
  int64_t rand64() {
    return static_cast<int64_t>(urand64()/2);
  }

  KOKKOS_INLINE_FUNCTION
  int64_t rand64(const int64_t& range) {
    const int64_t max_val = (MAX_RAND64/range)*range;
    int64_t tmp = rand64();
    while(tmp>=max_val)
      tmp = rand64();
    return tmp%range;
  }

  KOKKOS_INLINE_FUNCTION
  int64_t rand64(const int64_t& start, const int64_t& end ) {
    return rand64(end-start)+start;
  }

  KOKKOS_INLINE_FUNCTION
  float frand() {
    return 1.0f * urand64()/MAX_URAND64;
  }

  KOKKOS_INLINE_FUNCTION
  float frand(const float& range) {
    return range * urand64()/MAX_URAND64;
  }

  KOKKOS_INLINE_FUNCTION
  float frand(const float& start, const float& end ) {
    return frand(end-start)+start;
  }

  KOKKOS_INLINE_FUNCTION
  double drand() {
    return 1.0 * urand64()/MAX_URAND64;
  }

  KOKKOS_INLINE_FUNCTION
  double drand(const double& range) {
    return range * urand64()/MAX_URAND64;
  }

  KOKKOS_INLINE_FUNCTION
  double drand(const double& start, const double& end ) {
    return drand(end-start)+start;
  }

  //Marsaglia polar method for drawing a standard normal distributed random number
  KOKKOS_INLINE_FUNCTION
  double normal() {
    double S = 2.0;
    double U;
    while(S>=1.0) {
      U = 2.0*drand() - 1.0;
      const double V = 2.0*drand() - 1.0;
      S = U*U+V*V;
    }
    return U*std::sqrt(-2.0*log(S)/S);
  }

  KOKKOS_INLINE_FUNCTION
  double normal(const double& mean, const double& std_dev=1.0) {
    return mean + normal()*std_dev;
  }
};

template<>
inline
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
  num_states_ = 0;
  init(seed,4*32768);
}

template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
    i+=blockDim_x*blockDim_y*blockDim_z;
    if(i>=num_states_) {i = i_offset;}
  }

  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
#else
  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
#endif
}

template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
  state_(state.state_idx_) = state.state_;
  locks_(state.state_idx_) = 0;
  return;
#endif
}


template<>
inline
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
  num_states_ = 0;
  init(seed,4*32768);
}

template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
    i+=blockDim_x*blockDim_y*blockDim_z;
    if(i>=num_states_) {i = i_offset;}
  }

  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}

template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
  for(int i=0; i<16; i++)
    state_(state.state_idx_,i) = state.state_[i];
  locks_(state.state_idx_) = 0;
  return;
#endif
}


#endif
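For context, a minimal usage sketch (not part of this diff) of how the pool specializations above are consumed: get_state() locks one per-thread generator, free_state() releases it. The function and view names here are illustrative assumptions.

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

// Sketch, not from this commit: fill a view with uniform draws using the
// get_state()/free_state() protocol implemented by the specializations above.
void fill_uniform(Kokkos::View<double*> x, uint64_t seed) {
  Kokkos::Random_XorShift64_Pool<> pool(seed);    // one generator state per thread
  Kokkos::parallel_for("fill", x.extent(0), KOKKOS_LAMBDA(const int i) {
    auto gen = pool.get_state();                  // lock a free generator state
    x(i) = gen.drand();                           // uniform double in [0,1)
    pool.free_state(gen);                         // release the state for reuse
  });
}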
@@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  TEST_TARGETS += test-cuda
endif

ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
  OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
  TARGETS += KokkosAlgorithms_UnitTest_ROCm
  TEST_TARGETS += test-rocm
endif

ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
  TARGETS += KokkosAlgorithms_UnitTest_Threads
@@ -51,6 +57,9 @@ endif
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda

KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm

KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
@@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosAlgorithms_UnitTest_Cuda
	./KokkosAlgorithms_UnitTest_Cuda

test-rocm: KokkosAlgorithms_UnitTest_ROCm
	./KokkosAlgorithms_UnitTest_ROCm

test-threads: KokkosAlgorithms_UnitTest_Threads
	./KokkosAlgorithms_UnitTest_Threads
@@ -0,0 +1,112 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM

#include <cstdint>
#include <iostream>
#include <iomanip>

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>

#include <TestRandom.hpp>
#include <TestSort.hpp>

namespace Test {

class rocm : public ::testing::Test {
protected:
  static void SetUpTestCase()
  {
    std::cout << std::setprecision(5) << std::scientific;
    Kokkos::HostSpace::execution_space::initialize();
    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
  }
  static void TearDownTestCase()
  {
    Kokkos::Experimental::ROCm::finalize();
    Kokkos::HostSpace::execution_space::finalize();
  }
};

void rocm_test_random_xorshift64( int num_draws )
{
  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}

void rocm_test_random_xorshift1024( int num_draws )
{
  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}


#define ROCM_RANDOM_XORSHIFT64( num_draws ) \
  TEST_F( rocm, Random_XorShift64 ) { \
    rocm_test_random_xorshift64(num_draws); \
  }

#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \
  TEST_F( rocm, Random_XorShift1024 ) { \
    rocm_test_random_xorshift1024(num_draws); \
  }

#define ROCM_SORT_UNSIGNED( size ) \
  TEST_F( rocm, SortUnsigned ) { \
    Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \
  }

ROCM_RANDOM_XORSHIFT64( 132141141 )
ROCM_RANDOM_XORSHIFT1024( 52428813 )
ROCM_SORT_UNSIGNED(171)

#undef ROCM_RANDOM_XORSHIFT64
#undef ROCM_RANDOM_XORSHIFT1024
#undef ROCM_SORT_UNSIGNED
}
#else
void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
#endif /* #ifdef KOKKOS_ENABLE_ROCM */
@@ -27,7 +27,7 @@ fi
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
  MY_PID="$BASHPID"
  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
  HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
fi

################################################################################
@@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
################################################################################
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0
declare -i HPCBIND_QUEUE_MAPPING=0

if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="sbatch"
if [[ ! -z "${PMI_RANK}" ]]; then
  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="mpich"
  HPCBIND_QUEUE_INDEX=${PMI_RANK}
elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="openmpi"
  HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="mvapich2"
  HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="slurm"
  HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="bsub"
  HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="aprun"
  HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi


################################################################################
# Show help
################################################################################
@@ -91,13 +102,14 @@ function show_help {
echo " --proc-bind=<LOC> Set the initial process mask for the script"
echo " LOC can be any valid location argument for"
echo " hwloc-calc Default: all"
echo " --whole-system ${cmd} will ignore its parent process binding"
echo " --distribute=N Distribute the current cpuset into N partitions"
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --gpu-ignore-queue Ignore queue job id when choosing visible GPU"
echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
@@ -110,22 +122,30 @@ function show_help {
echo " --force-openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --show-bindings Show the bindings"
echo " --lstopo Show bindings in lstopo without executing a command"
echo " -v|--verbose Show options and relevant environment variables"
echo " --output-prefix=<P> Save the output to files of the form"
echo " P-N.log, P-N.out and P-N.err where P is the prefix"
echo " and N is the queue index or mpi rank (no spaces)"
echo " --output-mode=<Op> How console output should be handled."
echo " Options are all, rank0, and none. Default: rank0"
echo " --lstopo Show bindings in lstopo"
echo " -v|--verbose Print bindings and relevant environment variables"
echo " -h|--help Show this message"
echo ""
echo "Sample Usage:"
echo " Split the current process cpuset into 4 and use the 3rd partition"
echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
echo " Bing the process to all even cores"
echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even PUs"
echo " and save the output to rank specific files"
echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
echo " --distribute=4 -v --output-prefix=output -- command ..."
echo " Bind the process to all even cores"
echo " ${cmd} --proc-bind=core:even -v -- command ..."
echo " Bind to the first 64 cores and split the current process cpuset into 4"
echo " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
echo " skip GPU 0 when mapping visible devices"
echo " Bind the even cores of socket 0 and the odd cores of socket 1"
echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
echo " Skip GPU 0 when mapping visible devices"
echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
echo " Display the current bindings"
echo " ${cmd} --proc-bind=numa:0 --show-bindings -- command"
echo " ${cmd} --proc-bind=numa:0 -- command"
echo " Display the current bindings using lstopo"
echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo"
echo ""
@@ -144,7 +164,7 @@ fi
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
declare -i HPCBIND_PARTITION=-1
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
@@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0

declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0

for i in $@; do
  case $i in
HPCBIND_OUTPUT_PREFIX=""
HPCBIND_OUTPUT_MODE="rank0"

declare -i HPCBIND_HAS_COMMAND=0

for i in "$@"; do
  case "$i" in
    # number of partitions to create
    --no-hwloc-bind)
      HPCBIND_ENABLE_HWLOC_BIND=0
@@ -169,6 +193,10 @@ for i in $@; do
      HPCBIND_PROC_BIND="${i#*=}"
      shift
      ;;
    --whole-system)
      HPCBIND_HWLOC_PARENT_CPUSET=""
      shift
      ;;
    --distribute=*)
      HPCBIND_DISTRIBUTE="${i#*=}"
      shift
@@ -182,8 +210,8 @@ for i in $@; do
      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
      shift
      ;;
    --gpu-ignore-queue)
      HPCBIND_QUEUE_GPU_MAPPING=0
    --ignore-queue)
      HPCBIND_QUEUE_MAPPING=0
      shift
      ;;
    --no-gpu-mapping)
@@ -218,14 +246,18 @@ for i in $@; do
      HPCBIND_OPENMP_NESTED="false"
      shift
      ;;
    --show-bindings)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=1
    --output-prefix=*)
      HPCBIND_OUTPUT_PREFIX="${i#*=}"
      shift
      ;;
    --output-mode=*)
      HPCBIND_OUTPUT_MODE="${i#*=}"
      # convert to lower case
      HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
      shift
      ;;
    --lstopo)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=0
      HPCBIND_LSTOPO=1
      shift
      ;;
@@ -239,6 +271,7 @@ for i in $@; do
      ;;
    # ignore remaining arguments
    --)
      HPCBIND_HAS_COMMAND=1
      shift
      break
      ;;
@@ -250,16 +283,41 @@ for i in $@; do
  esac
done

################################################################################
# Check output mode
################################################################################
declare -i HPCBIND_TEE=0

if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
  HPCBIND_TEE=0
elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
  HPCBIND_TEE=1
elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
  # default to rank0 printing to screen
  HPCBIND_TEE=1
fi


if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
  HPCBIND_LOG=/dev/null
  HPCBIND_ERR=/dev/null
  HPCBIND_OUT=/dev/null
else
  HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
  HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
  HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
  > ${HPCBIND_LOG}
fi


################################################################################
# Check unknown arguments
################################################################################
if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
  exit 1
fi


################################################################################
# Check that visible gpus are valid
################################################################################
@@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
          ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
      echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
      echo "HPCBIND Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
      HPCBIND_VISIBLE_GPUS[$i]=0;
    fi
  done
  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
fi


################################################################################
# Check OpenMP percent
################################################################################
if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
  echo "OpenMP percent < 1, setting to 1"
  HPCBIND_OPENMP_PERCENT=1
elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
  echo "OpenMP percent > 100, setting to 100"
  HPCBIND_OPENMP_PERCENT=100
fi
@@ -291,15 +346,21 @@ fi
# Check distribute
################################################################################
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  HPCBIND_DISTRIBUTE=1
fi

if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
  echo "Invalid input for distribute-partition, changing to 0"
################################################################################
# choose the correct partition
################################################################################
if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
  HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
  HPCBIND_PARTITION=0
fi

if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
  HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
fi

################################################################################
# Find cpuset and num threads
@@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0

if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
  if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
  else
    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
  fi

  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
  HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
  if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
    CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
    HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
  else
    HPCBIND_HWLOC_CPUSET="${BINDING}"
  fi
  HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
else
  HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
@@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
################################################################################

if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
    declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
  else
    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
  fi
fi
@@ -389,22 +454,22 @@ fi
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
  export HPCBIND_HWLOC_PARENT_CPUSET="all"
else
  export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
  export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
fi
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
  export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
  export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
  export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
  export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
  export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
fi
@@ -412,43 +477,63 @@ fi
# Print verbose
################################################################################

if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
  MY_ENV=$(env | sort)
  echo "[HPCBIND]"
  echo "${MY_ENV}" | grep -E "^HPCBIND_"
  echo "[CUDA]"
  echo "${MY_ENV}" | grep -E "^CUDA_"
  echo "[OPENMP]"
  echo "${MY_ENV}" | grep -E "^OMP_"
fi
TMP_ENV=$(env | sort)
if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
  echo "[HOST]" >> ${HPCBIND_LOG}
  hostname -s >> ${HPCBIND_LOG}
  echo "[HPCBIND]" >> ${HPCBIND_LOG}
  echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
  echo "[CUDA]" >> ${HPCBIND_LOG}
  echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
  echo "[OPENMP]" >> ${HPCBIND_LOG}
  echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}

  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
    echo "[BINDINGS]"
    hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
  elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
    echo "Unable to show bindings, hwloc not available."
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    echo "[BINDINGS]" >> ${HPCBIND_LOG}
    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
  else
    echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
  fi
else
  echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
  hostname -s > >(tee -a ${HPCBIND_LOG})
  echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
  echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
  echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
  echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
  echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
  echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})

  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
  else
    echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
  fi
fi

################################################################################
# Run command
################################################################################

if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
  else
    eval $@
  fi
else
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
      echo "[BINDINGS]"
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
# must be the last executed command so that the return value is correct
if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
  hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
  # clear output files
  > ${HPCBIND_ERR}
  > ${HPCBIND_OUT}
  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
    else
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
    fi
  else
    echo "Unable to show bindings, hwloc not available."
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
    else
      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
    fi
  fi
fi
@@ -1,221 +0,0 @@
#!/usr/bin/env bash

# check if hwloc commands exist
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"

type hwloc-distrib >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"

type hwloc-ls >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"

type hwloc-calc >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"

type hwloc-ps >/dev/null 2>&1
HAS_HWLOC="${HAS_HWLOC} + $?"


# parse args
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False

# get the current process cpuset
if [[ ${HAS_HWLOC} -eq 0 ]]; then
  MY_PID="$BASHPID"
  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
  echo "$CURRENT_CPUSET"
fi

function show_help {
  local cmd=$(basename "$0")
  echo "Usage: ${cmd} <options> -- command ..."
  echo " Uses hwloc to divide the node into the given number of groups,"
  echo " set the appropriate OMP_NUM_THREADS and execute the command on the"
  echo " selected group."
  echo ""
  echo " NOTE: This command assumes it has exclusive use of the node"
  echo ""
  echo "Options:"
  echo " --proc-bind=<LOC> Set the initial process mask for the script. "
  echo " LOC can be any valid location argument for"
  echo " hwloc-calc. Defaults to the entire machine"
  echo " --distribute=N Distribute the current proc-bind into N groups"
  echo " --index=I Use the i'th group (zero based)"
  echo " --openmp=M.m Set env variables for the given OpenMP version"
  echo " (default 4.0)"
  echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
  echo " --no-openmp-nested Set OMP_NESTED to false"
  echo " -v|--verbose"
  echo " -h|--help"
  echo ""
  echo "Sample Usage:"
  echo " ${cmd} --distribute=4 --index=2 -v -- command ..."
  echo ""
}

if [[ "$#" -eq 0 ]]; then
  show_help
  exit 0
fi


for i in $@; do
  case $i in
    # number of partitions to create
    --proc-bind=*)
      PROC_BIND="${i#*=}"
      shift
      ;;
    --distribute=*)
      DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which group to use
    --index=*)
      INDEX="${i#*=}"
      shift
      ;;
    --openmp=*)
      OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      OPENMP_PROC_BIND=False
      shift
      ;;
    --no-openmp-nested)
      OPENMP_NESTED=False
      shift
      ;;
    -v|--verbose)
      VERBOSE=True
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done

if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi

if [[ ${DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  DISTRIBUTE=1
fi

if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
  echo "Invalid input for index, changing index to 0"
  INDEX=0
fi

if [[ ${HAS_HWLOC} -ne 0 ]]; then
  echo "hwloc not found, no process binding will occur"
  DISTRIBUTE=1
  INDEX=0
fi

if [[ ${HAS_HWLOC} -eq 0 ]]; then

  if [[ "${CURRENT_CPUSET}" == "" ]]; then
    BINDING=$(hwloc-calc ${PROC_BIND})
  else
    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
  fi

  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
  CPUSET=${CPUSETS[${INDEX}]}
  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: true"
    echo " proc_bind: ${PROC_BIND}"
    echo " distribute: ${DISTRIBUTE}"
    echo " index: ${INDEX}"
    echo " parent_cpuset: ${CURRENT_CPUSET}"
    echo " cpuset: ${CPUSET}"
    echo "omp_num_threads: ${NUM_THREADS}"
    echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
    echo "omp_nested: ${OPENMP_NESTED}"
    echo "OpenMP: ${OPENMP_VERSION}"
  fi

  # set OMP env
  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
      export OMP_PLACES="threads"
      export OMP_PROC_BIND="spread"
    else
      export OMP_PROC_BIND="true"
      unset OMP_PLACES
    fi
  else
    unset OMP_PLACES
    unset OMP_PROC_BIND
  fi
  if [[ "${OPENMP_NESTED}" == "True" ]]; then
    export OMP_NESTED="true"
  else
    export OMP_NESTED="false"
  fi
  export OMP_NUM_THREADS="${NUM_THREADS}"

  hwloc-bind ${CPUSET} -- $@
else
  NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: false"
    echo "omp_num_threads: ${NUM_THREADS}"
    echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
    echo "omp_nested: ${OPENMP_NESTED}"
    echo "OpenMP: ${OPENMP_VERSION}"
  fi

  # set OMP env
  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
      export OMP_PLACES="threads"
      export OMP_PROC_BIND="spread"
    else
      export OMP_PROC_BIND="true"
      unset OMP_PLACES
    fi
  else
    unset OMP_PLACES
    unset OMP_PROC_BIND
  fi
  if [[ "${OPENMP_NESTED}" == "True" ]]; then
    export OMP_NESTED="true"
  else
    export OMP_NESTED="false"
  fi
  export OMP_NUM_THREADS="${NUM_THREADS}"

  eval $@
fi
@@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0

# Check if we have -std=c++X or --std=c++X already
stdcxx_applied=0

#echo "Arguments: $# $@"

while [ $# -gt 0 ]
@@ -130,10 +133,16 @@ do
    cuda_args="$cuda_args $1 $2"
    shift
    ;;
  #Handle c++11 setting
  --std=c++11|-std=c++11)
    shared_args="$shared_args $1"
  #Handle c++11
  --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
    if [ $stdcxx_applied -eq 1 ]; then
      echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
    else
      shared_args="$shared_args $1"
      stdcxx_applied=1
    fi
    ;;

  #strip off -std=c++98 due to nvcc warnings, and because Tribits will place both -std=c++11 and -std=c++98
  -std=c++98|--std=c++98)
    ;;
@@ -9,3 +9,4 @@ tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a
@@ -1,4 +1,4 @@
module purge
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu

#Run Trilinos CheckinTest
@@ -125,6 +125,123 @@ namespace Impl {
};
}

/// \class GraphRowViewConst
/// \brief View of a row of a sparse graph.
/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
///
/// This class provides a generic view of a row of a sparse graph.
/// We intended this class to view a row of a StaticCrsGraph, but
/// GraphType need not necessarily be StaticCrsGraph.
///
/// The row view is suited for computational kernels like sparse
/// matrix-vector multiply. The view is always const, as it does not
/// allow graph modification.
///
/// Here is an example loop over the entries in the row:
/// \code
/// typedef typename GraphRowViewConst<GraphType>::ordinal_type ordinal_type;
///
/// GraphRowViewConst<GraphType> G_i = ...;
/// const ordinal_type numEntries = G_i.length;
/// for (ordinal_type k = 0; k < numEntries; ++k) {
///   ordinal_type j = G_i.colidx (k);
///   // ... do something with the column index j ...
/// }
/// \endcode
///
/// GraphType must provide the \c data_type
/// typedef. In addition, it must make sense to use GraphRowViewConst to
/// view a row of GraphType. In particular, column
/// indices of a row must be accessible using the <tt>entries</tt>
/// resp. <tt>colidx</tt> arrays given to the constructor of this
/// class, with a constant <tt>stride</tt> between successive entries.
/// The stride is one for the compressed sparse row storage format (as
/// is used by CrsMatrix), but may be greater than one for other
/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
template<class GraphType>
struct GraphRowViewConst {
  //! The type of the column indices in the row.
  typedef const typename GraphType::data_type ordinal_type;

private:
  //! Array of (local) column indices in the row.
  ordinal_type* colidx_;
  /// \brief Stride between successive entries in the row.
  ///
  /// For compressed sparse row (CSR) storage, this is always one.
  /// This might be greater than one for storage formats like ELLPACK
  /// or Jagged Diagonal. Nevertheless, the stride can never be
  /// greater than the number of rows or columns in the matrix. Thus,
  /// \c ordinal_type is the correct type.
  const ordinal_type stride_;

public:
  /// \brief Constructor
  ///
  /// \param colidx_in [in] Array of the row's column indices.
  /// \param stride [in] (Constant) stride between matrix entries in
  ///   the above array.
  /// \param count [in] Number of entries in the row.
  KOKKOS_INLINE_FUNCTION
  GraphRowViewConst ( ordinal_type* const colidx_in,
                      const ordinal_type& stride,
                      const ordinal_type& count) :
    colidx_ (colidx_in), stride_ (stride), length (count)
  {}

  /// \brief Constructor with offset into \c colidx array
  ///
  /// \param colidx_in [in] Array of the row's column indices.
  /// \param stride [in] (Constant) stride between matrix entries in
  ///   each of the above arrays.
  /// \param count [in] Number of entries in the row.
  /// \param idx [in] Start offset into \c colidx array
  ///
  /// \tparam OffsetType The type of \c idx (see above). Must be a
  ///   built-in integer type. This may differ from ordinal_type.
  ///   For example, the matrix may have dimensions that fit in int,
  ///   but a number of entries that does not fit in int.
  template<class OffsetType>
  KOKKOS_INLINE_FUNCTION
  GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
                      const ordinal_type& stride,
                      const ordinal_type& count,
                      const OffsetType& idx,
                      const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
    colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
  {}

  /// \brief Number of entries in the row.
  ///
  /// This is a public const field rather than a public const method,
  /// in order to avoid possible overhead of a method call if the
  /// compiler is unable to inline that method call.
  ///
  /// We assume that rows contain no duplicate entries (i.e., entries
  /// with the same column index). Thus, a row may have up to
  /// A.numCols() entries. This means that the correct type of
  /// 'length' is ordinal_type.
  const ordinal_type length;

  /// \brief (Const) reference to the column index of entry i in this
  ///   row of the sparse matrix.
  ///
  /// "Entry i" is not necessarily the entry with column index i, nor
  /// does i necessarily correspond to the (local) row index.
  KOKKOS_INLINE_FUNCTION
  ordinal_type& colidx (const ordinal_type& i) const {
    return colidx_[i*stride_];
  }

  /// \brief An alias for colidx
  KOKKOS_INLINE_FUNCTION
  ordinal_type& operator()(const ordinal_type& i) const {
    return colidx(i);
  }
};


/// \class StaticCrsGraph
/// \brief Compressed row storage array.
///
@@ -218,6 +335,38 @@ public:
  static_cast<size_type> (0);
}

/// \brief Return a const view of row i of the graph.
///
/// If row i does not belong to the graph, return an empty view.
///
/// The returned object \c view implements the following interface:
/// <ul>
/// <li> \c view.length is the number of entries in the row </li>
/// <li> \c view.colidx(k) returns a const reference to the
///   column index of the k-th entry in the row </li>
/// </ul>
/// k is not a column index; it just counts from 0 to
/// <tt>view.length - 1</tt>.
///
/// Users should not rely on the return type of this method. They
/// should instead assign to 'auto'. That allows compile-time
/// polymorphism for different kinds of sparse matrix formats (e.g.,
/// ELLPACK or Jagged Diagonal) that we may wish to support in the
/// future.
KOKKOS_INLINE_FUNCTION
GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
  const size_type start = row_map(i);
  // count is guaranteed to fit in ordinal_type, as long as no row
  // has duplicate entries.
  const data_type count = static_cast<data_type> (row_map(i+1) - start);

  if (count == 0) {
    return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
  } else {
    return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
  }
}

/** \brief Create a row partitioning into a given number of blocks
 *   balancing non-zeros + a fixed cost per row.
 */
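A hedged sketch (not part of this diff) of how the new rowConst() accessor composes with GraphRowViewConst; the helper name and its parameters are illustrative assumptions, and only the length/colidx interface shown above is relied on.

// Sketch, not from this commit: count a row's entries whose column index
// falls below a bound, using the rowConst() view added above.
template <class GraphType>
KOKKOS_INLINE_FUNCTION
int count_below (const GraphType& graph,
                 const typename GraphType::data_type i,
                 const typename GraphType::data_type bound) {
  auto row = graph.rowConst(i);  // assign to 'auto', as the docs advise
  int count = 0;
  for (typename GraphType::data_type k = 0; k < row.length; ++k) {
    if (row.colidx(k) < bound) ++count;  // k counts entries, not column ids
  }
  return count;
}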
@@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
  for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
    const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
    const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
    if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
          m_func(offset_0 , offset_1);
        }

@@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
// LR
else {
  for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
    const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
    const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
    if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
          m_func(offset_0 , offset_1);
        }

@@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
if (RP::inner_direction == RP::Left) {
  // Loop over size maxnumblocks until full range covered
  for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
    const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
    const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
    if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
          m_func(Tag(), offset_0 , offset_1);
        }

@@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
}
else {
  for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
    const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
    const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
    if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
          m_func(Tag(), offset_0 , offset_1);
        }

@@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
  for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
    const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
    const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
    if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {

      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
              m_func(offset_0 , offset_1 , offset_2);
            }

@@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
// LR
else {
  for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
    const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
    const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
    if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

          for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
              m_func(offset_0 , offset_1 , offset_2);
            }

@@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
  for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
    const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
    const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
    if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {

      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
              m_func(Tag(), offset_0 , offset_1 , offset_2);
            }

@@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
}
else {
  for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
|
||||
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
|
||||
if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
|
||||
|
||||
for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
|
||||
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
|
||||
if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
|
||||
|
||||
for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
|
||||
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
|
||||
if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
|
||||
m_func(Tag(), offset_0 , offset_1 , offset_2);
|
||||
}
|
||||
|
@@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
     const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];

     for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-      const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+      const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
       if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {

         for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-          const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+          const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
           if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

             for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-              const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+              const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
               if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                 for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                  const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                  const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                   if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                     m_func(offset_0 , offset_1 , offset_2 , offset_3);
                   }

@@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
     const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];

     for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

         for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
           if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

             for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-              const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+              const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

                 for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                  const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                  const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                   if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                     m_func(offset_0 , offset_1 , offset_2 , offset_3);
                   }

@@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
     const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];

     for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-      const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+      const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
       if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {

         for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-          const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+          const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
           if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

             for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-              const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+              const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
               if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                 for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                  const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                  const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                   if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                     m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
                   }

@@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
     const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];

     for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

         for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
+          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
           if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

             for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-              const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+              const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

                 for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                  const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                  const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                   if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                     m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
                   }
@@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
     const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];

     for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
       if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {

         for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-          const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+          const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
           if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

             for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                 for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                  const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                  const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                   if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                     for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                         m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                       }

@@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
     const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];

     for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

         for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
           if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

             for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                 for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                   if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                     for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                       if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                         m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                       }

@@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
     const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];

     for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
       if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {

         for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-          const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+          const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
           if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

             for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                 for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                  const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                  const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                   if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                     for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                         m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                       }

@@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
     const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];

     for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

         for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
           if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

             for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                 for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                   if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                     for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                      const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                       if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                         m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                       }
@@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
     const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];

     for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-      const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+      const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
       if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {

         for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-          const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+          const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
           if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

             for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-              const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+              const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
               if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                 for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                  const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                  const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                   if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                     for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                      const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                      const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                       if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                         for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                          const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                          const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                           if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                             m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                           }

@@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
     const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];

     for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

         for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
           if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

             for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                 for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                   if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                     for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                      const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                      const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                       if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

                         for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                          const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                          const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                           if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                             m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                           }

@@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
     const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];

     for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-      const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+      const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
       if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {

         for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-          const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+          const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
           if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

             for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-              const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+              const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
               if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                 for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                  const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                  const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                   if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                     for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                      const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                      const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                       if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                         for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                          const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                          const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                           if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                             m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                           }

@@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
     const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];

     for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+      const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
       if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

         for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+          const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
           if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

             for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+              const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
               if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                 for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                  const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                   if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                     for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                      const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                      const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                       if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

                         for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                          const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                          const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                           if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                             m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                           }
@@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()

 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM)
 static void cuda_parallel_launch_constant_memory()
 {
   const DriverType & driver =

@@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )

 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM)
 static void cuda_parallel_launch_local_memory( const DriverType driver )
 {
   driver();
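These two hunks comment out the `__launch_bounds__` qualifiers on the CUDA launch wrappers; per the merge message this is the workaround for a performance regression, trading the occupancy guarantee for whatever register allocation the compiler picks on its own. For context, a hedged sketch of what the qualifier does when present (illustrative CUDA C++ kernel, not from this codebase):

#include <cstdio>

// __launch_bounds__(maxThreadsPerBlock, minBlocksPerSM) caps per-thread
// register use so the kernel can always launch with maxThreadsPerBlock
// threads and keep at least minBlocksPerSM resident blocks per SM.
// Removing it, as the diff does, lets the compiler use more registers
// per thread, which can be faster when the occupancy target was forcing
// register spills.
__global__ __launch_bounds__(256, 2)
void saxpy(int n, float a, const float* x, float* y) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = a * x[i] + y[i];
}

int main() {
  // A launch must respect the declared bound (<= 256 threads per block):
  // saxpy<<<(n + 255) / 256, 256>>>(n, 2.0f, d_x, d_y);
  return 0;
}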
@@ -242,45 +242,89 @@ public:
     re_ = v;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const complex<RealType>& src) {
+  complex<RealType>&
+  operator += (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ += src.re_;
     im_ += src.im_;
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile complex<RealType>& src) volatile {
+  void
+  operator += (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ += src.re_;
     im_ += src.im_;
   }

   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const RealType& src) {
+  complex<RealType>&
+  operator += (const std::complex<RealType>& src) {
+    re_ += src.real();
+    im_ += src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ += src;
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile RealType& src) volatile {
+  void
+  operator += (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ += src;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator -= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ -= src.re_;
     im_ -= src.im_;
     return *this;
   }

   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const RealType& src) {
+  complex<RealType>&
+  operator -= (const std::complex<RealType>& src) {
+    re_ -= src.real();
+    im_ -= src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ -= src;
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator *= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
     re_ = realPart;
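The compound-assignment operators now accept any `InputRealType` convertible to `RealType`, guarded by a `static_assert`, plus dedicated host-side overloads for `std::complex<RealType>`. A hedged usage sketch (assumes the Kokkos headers; the variable names are illustrative):

#include <Kokkos_Complex.hpp>
#include <complex>

void mixed_precision_accumulate() {
  Kokkos::complex<double> z(1.0, 2.0);

  z += Kokkos::complex<float>(0.5f, 0.5f); // templated overload: float -> double OK
  z += 3;                                  // scalar overload: int -> double OK
  z += std::complex<double>(0.0, 1.0);     // std::complex overload (host only)

  // A type not convertible to double would now fail the static_assert
  // at compile time instead of producing a confusing overload error.
}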
@@ -288,8 +332,12 @@ public:
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile complex<RealType>& src) volatile {
+  void
+  operator *= (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
     re_ = realPart;
@@ -297,20 +345,70 @@ public:
   }

   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const RealType& src) {
+  complex<RealType>&
+  operator *= (const std::complex<RealType>& src) {
+    const RealType realPart = re_ * src.real() - im_ * src.imag();
+    const RealType imagPart = re_ * src.imag() + im_ * src.real();
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ *= src;
     im_ *= src;
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile RealType& src) volatile {
+  void
+  operator *= (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");
     re_ *= src;
     im_ *= src;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const complex<RealType>& y) {
+  complex<RealType>&
+  operator /= (const complex<InputRealType>& y) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");

     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
     const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());

     // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
     // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
     // because y/s is NaN.
     if (s == 0.0) {
       this->re_ /= s;
       this->im_ /= s;
     }
     else {
       const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
       const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
       const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
                                     y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
       *this = x_scaled * y_conj_scaled;
       *this /= y_scaled_abs;
     }
     return *this;
   }

+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const std::complex<RealType>& y) {

     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
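The division operators use the classic scale-by-1-norm trick: dividing both operands by s = |Re(y)| + |Im(y)| keeps the intermediate squares in `y_scaled_abs` from overflowing when y has huge components, while the s == 0 branch deliberately divides by zero so that x/0 still yields the Inf/NaN pattern the comments describe. A small self-contained illustration of why the naive formula fails (plain C++, not the Kokkos code):

#include <cmath>
#include <cstdio>

// Naive complex division forms |y|^2 = yr*yr + yi*yi directly; for
// yr ~ 1e200 that intermediate overflows to Inf even though the true
// quotient (here exactly 1.0) is perfectly representable. Scaling by
// s = |yr| + |yi| first keeps every intermediate near 1.
int main() {
  const double xr = 1e200, xi = 0.0, yr = 1e200, yi = 0.0;

  const double naive = (xr * yr + xi * yi) / (yr * yr + yi * yi); // Inf/Inf -> NaN
  const double s = std::fabs(yr) + std::fabs(yi);
  const double scaled = ((xr / s) * (yr / s) + (xi / s) * (yi / s)) /
                        ((yr / s) * (yr / s) + (yi / s) * (yi / s)); // 1.0

  std::printf("naive = %g, scaled = %g\n", naive, scaled);
  return 0;
}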
@@ -334,57 +432,95 @@ public:
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const RealType& src) {
+  complex<RealType>&
+  operator /= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");

     re_ /= src;
     im_ /= src;
     return *this;
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const complex<RealType>& src) {
-    return (re_ == src.re_) && (im_ == src.im_);
+  bool
+  operator == (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");

+    return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
   }

   KOKKOS_INLINE_FUNCTION
-  bool operator == (const RealType src) {
-    return (re_ == src) && (im_ == RealType(0));
+  bool
+  operator == (const std::complex<RealType>& src) {
+    return (re_ == src.real()) && (im_ == src.imag());
   }

+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");

+    return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
+  }

+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");

+    return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
+  }

   KOKKOS_INLINE_FUNCTION
-  bool operator != (const complex<RealType>& src) {
-    return (re_ != src.re_) || (im_ != src.im_);
+  bool
+  operator != (const std::complex<RealType>& src) {
+    return (re_ != src.real()) || (im_ != src.imag());
   }

+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const RealType src) {
-    return (re_ != src) || (im_ != RealType(0));
-  }
+  bool
+  operator != (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value,
+                  "InputRealType must be convertible to RealType");

+    return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
+  }

 };
 //! Binary + operator for complex complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
 }

 //! Binary + operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () + y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
 }

 //! Binary + operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x + y.real (), y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
 }

 //! Unary + operator for complex.
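Each nonmember operator now takes two independent real types and returns a `complex` of their `std::common_type`, mirroring what `double + float` already does for plain scalars. A hedged sketch of the effect (assumes the Kokkos headers; the `static_assert`s just document the deduced types):

#include <Kokkos_Complex.hpp>
#include <type_traits>

void common_type_demo() {
  Kokkos::complex<float>  a(1.0f, 2.0f);
  Kokkos::complex<double> b(3.0, 4.0);

  auto c = a + b;      // complex<float> + complex<double>
  auto d = a * 2.0;    // complex<float> * double
  auto e = 1.5f + b;   // float + complex<double>

  // common_type<float,double> is double, so everything promotes:
  static_assert(std::is_same<decltype(c), Kokkos::complex<double>>::value, "");
  static_assert(std::is_same<decltype(d), Kokkos::complex<double>>::value, "");
  static_assert(std::is_same<decltype(e), Kokkos::complex<double>>::value, "");
}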
@@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
 }

 //! Binary - operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
 }

 //! Binary - operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () - y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
 }

 //! Binary - operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x - y.real (), - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
 }

 //! Unary - operator for complex.
@@ -428,12 +564,12 @@ operator - (const complex<RealType>& x) {
 }

 //! Binary * operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }

 /// \brief Binary * operator for std::complex and complex.
@@ -446,33 +582,34 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
 /// This function cannot be called in a CUDA device function, because
 /// std::complex's methods and nonmember functions are not marked as
 /// CUDA device functions.
-template<class RealType>
-complex<RealType>
-operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+template<class RealType1, class RealType2>
+inline
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }

 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }

 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& y, const RealType& x) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& y, const RealType2& x) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }

 //! Imaginary part of a complex number.
@@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const complex<RealType1>& x, const RealType2& y) {
-  return complex<RealType1> (real (x) / y, imag (x) / y);
+  return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
 }

 //! Binary operator / for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator / (const complex<RealType>& x, const complex<RealType>& y) {
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
   // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
   // If the real part is +/-Inf and the imaginary part is -/+Inf,
   // this won't change the result.
-  const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));

   // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
   // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
   // because y/s is NaN.
   if (s == 0.0) {
-    return complex<RealType> (real (x) / s, imag (x) / s);
+    return complex<common_real_type> (real (x) / s, imag (x) / s);
   }
   else {
-    const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
-    const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
-    const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
+    const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
+    const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
+    const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
                                    imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
-    complex<RealType> result = x_scaled * y_conj_scaled;
+    complex<common_real_type> result = x_scaled * y_conj_scaled;
     result /= y_scaled_abs;
     return result;
   }
@@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const RealType1& x, const complex<RealType2>& y) {
-  return complex<RealType1> (x)/y;
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
 }

 //! Equality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) == real (y) && imag (x) == imag (y);
+bool
+operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) &&
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
 }

 /// \brief Equality operator for std::complex and Kokkos::complex.
@@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
 /// Otherwise, CUDA builds will give compiler warnings ("warning:
 /// calling a constexpr __host__ function("real") from a __host__
 /// __device__ function("operator==") is not allowed").
-template<class RealType>
-bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) == real (y) && std::imag (x) == imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) &&
+           static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
 }

 //! Equality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
+bool
+operator == (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) &&
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
 }

 //! Equality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const RealType& x, const complex<RealType>& y) {
+bool
+operator == (const RealType1& x, const complex<RealType2>& y) {
   return y == x;
 }

 //! Inequality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) != real (y) || imag (x) != imag (y);
+bool
+operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) ||
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
 }

 //! Inequality operator for std::complex and Kokkos::complex.
-template<class RealType>
-KOKKOS_INLINE_FUNCTION
-bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) != real (y) || std::imag (x) != imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) ||
+           static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
 }

 //! Inequality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
+bool
+operator != (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) ||
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
 }

 //! Inequality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const RealType& x, const complex<RealType>& y) {
+bool
+operator != (const RealType1& x, const complex<RealType2>& y) {
   return y != x;
 }
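The comparisons likewise promote both sides to the common real type before testing, so narrowing never happens silently on one side only. A hedged sketch of what this buys (assumes the Kokkos headers):

#include <Kokkos_Complex.hpp>
#include <cassert>

void comparison_demo() {
  Kokkos::complex<float>  a(0.5f, 0.0f);  // 0.5 is exactly representable in float
  Kokkos::complex<double> b(0.5, 0.0);

  // Both sides are cast to common_type<float,double> (= double) first,
  // so the mixed-precision comparison behaves as expected.
  assert(a == b);
  assert(!(a != b));

  // complex-vs-scalar compares the imaginary part against zero.
  assert(Kokkos::complex<double>(2.0, 0.0) == 2.0);
}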
@@ -353,7 +353,14 @@ struct CountAndFill {
   struct Fill {};
   KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
     auto j = m_crs.row_map(i);
-    data_type* fill = &(m_crs.entries(j));
+    /* we don't want to access entries(entries.size()), even if its just to get its
+       address and never use it.
+       this can happen when row (i) is empty and all rows after it are also empty.
+       we could compare to row_map(i + 1), but that is a read from global memory,
+       whereas dimension_0() should be part of the View in registers (or constant memory) */
+    data_type* fill =
+      (j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
+      nullptr : (&(m_crs.entries(j)));
     m_functor(i, fill);
   }
   using self_type = CountAndFill<CrsType, Functor>;
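The guard avoids forming a reference one past the end of `entries` when a trailing run of rows is empty, and it does so by comparing against an extent already held in registers instead of re-reading `row_map(i + 1)` from global memory. The same pattern in a minimal standalone form (plain C++; `row_map` and `entries` here are illustrative arrays, not the Kokkos Views):

#include <cstddef>
#include <cstdio>
#include <vector>

// CRS layout: row i owns entries [row_map[i], row_map[i+1]).
// If row i and every row after it are empty, row_map[i] == entries.size(),
// and &entries[row_map[i]] would index one past the end. Returning nullptr
// for that case is cheaper than reading row_map[i + 1] again.
int* row_begin(std::vector<int>& entries,
               const std::vector<std::size_t>& row_map, std::size_t i) {
  const std::size_t j = row_map[i];
  return (j == entries.size()) ? nullptr : &entries[j];
}

int main() {
  std::vector<int> entries = {7, 8, 9};
  std::vector<std::size_t> row_map = {0, 2, 3, 3};  // row 2 is empty and last
  std::printf("row0 starts at %d\n", *row_begin(entries, row_map, 0));
  std::printf("row2 begin is %s\n",
              row_begin(entries, row_map, 2) ? "valid" : "null");
  return 0;
}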
@@ -147,12 +147,11 @@ public:
              , const size_t arg_alloc_size ) const;

   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return "HBW"; }

 private:

   AllocationMechanism m_alloc_mech;
-  static constexpr const char* m_name = "HBW";
   friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
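Returning the literal directly from an in-class (implicitly inline) constexpr function sidesteps the undefined-symbol link error that a `static constexpr` data member can trigger in pre-C++17 builds, where odr-use requires an out-of-line definition. A minimal reproduction of the hazard and the fix (illustrative class, not the Kokkos one):

#include <cstdio>

struct Before {
  // Pre-C++17, odr-using this member (e.g. binding it to a reference)
  // requires "constexpr const char* Before::m_name;" in some .cpp file,
  // or the linker reports an undefined symbol.
  static constexpr const char* m_name = "HBW";
};

struct After {
  // A constexpr member function defined in-class is implicitly inline,
  // so every translation unit gets a definition and nothing can go missing.
  static constexpr const char* name() { return "HBW"; }
};

int main() {
  std::printf("%s\n", After::name());
  return 0;
}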
@@ -192,7 +192,7 @@ template<>
 struct reduction_identity<float> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
 };

@@ -200,7 +200,7 @@ template<>
 struct reduction_identity<double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
 };

@@ -208,7 +208,7 @@ template<>
 struct reduction_identity<long double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
 };
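The bug being fixed here is easy to miss: unlike the integer limits, FLT_MIN/DBL_MIN/LDBL_MIN are the smallest positive normalized values (~1e-38, ~2e-308), not the most negative ones, so a max-reduction seeded with them would report ~1e-38 for data that is entirely negative. The correct identity for max is the most negative finite value, -FLT_MAX and friends. A small demonstration (plain C++):

#include <algorithm>
#include <cfloat>
#include <cstdio>

int main() {
  const float data[] = {-3.0f, -7.0f, -1.5f};

  float wrong = FLT_MIN;   // ~1.18e-38: positive, already bigger than all data!
  float right = -FLT_MAX;  // most negative finite float: a true identity for max
  for (float x : data) {
    wrong = std::max(wrong, x);
    right = std::max(right, x);
  }
  std::printf("wrong identity -> %g (should be -1.5)\n", wrong); // ~1.18e-38
  std::printf("right identity -> %g\n", right);                  // -1.5
  return 0;
}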
@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
 } // namespace Kokkos

+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x (hc_get_group_id(0))
+#define blockIdx_y (hc_get_group_id(1))
+#define blockIdx_z (hc_get_group_id(2))
+
+#define blockDim_x (hc_get_group_size(0))
+#define blockDim_y (hc_get_group_size(1))
+#define blockDim_z (hc_get_group_size(2))
+
+#define gridDim_x (hc_get_num_groups(0))
+#define gridDim_y (hc_get_num_groups(1))
+#define gridDim_z (hc_get_num_groups(2))
+
 #include <ROCm/Kokkos_ROCm_Parallel.hpp>
 #include <ROCm/Kokkos_ROCm_Task.hpp>
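The underscore-suffixed macros give ROCm's HC backend CUDA-style index names, so kernel code can compute global indices the familiar way. The hc_get_* intrinsics exist only in device code; the sketch below mocks them on the host purely to show what an expression like blockIdx_x * blockDim_x + threadIdx_x expands to (all values hypothetical):

// Host-side mocks of the HC intrinsics, only to illustrate the expansion.
static int hc_get_workitem_id(int) { return 3; }   // hypothetical stand-ins
static int hc_get_group_id(int)    { return 2; }
static int hc_get_group_size(int)  { return 64; }

#define threadIdx_x (hc_get_workitem_id(0))
#define blockIdx_x  (hc_get_group_id(0))
#define blockDim_x  (hc_get_group_size(0))

int main() {
  // CUDA-style flat index written against the macro names:
  int global_id = blockIdx_x * blockDim_x + threadIdx_x;  // 2*64 + 3 = 131
  return global_id == 131 ? 0 : 1;
}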
@ -88,6 +88,7 @@ build-makefile-kokkos:
	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
	echo "" >> Makefile.kokkos
	echo "#Variables used in application Makefiles" >> Makefile.kokkos
	echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
+	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
                              , thread_local_bytes
                              );

+      omp_set_num_threads(partition_size);
       f( omp_get_thread_num(), omp_get_num_threads() );

       Impl::t_openmp_instance->~Exec();
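The added omp_set_num_threads call makes each nested partition actually run with partition_size threads (changelog item #1082). A usage sketch follows; the trailing arguments to partition_master (requested partition count and size) are assumed from this snapshot's signature fragment and may differ:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Split the master threads into two partitions, each running with four
    // threads; the functor receives (partition id, number of partitions).
    Kokkos::OpenMP::partition_master(
      [](int partition_id, int num_partitions) {
        std::printf("partition %d of %d\n", partition_id, num_partitions);
      },
      2, 4);
  }
  Kokkos::finalize();
  return 0;
}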
@ -113,7 +113,6 @@ void reduce_enqueue(

   if (output_length < 1) return;

-  assert(output_result != nullptr);
   const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);

   // allocate host and device memory for the results from each team
@ -176,14 +175,17 @@ void reduce_enqueue(
     }

   });
-  ValueInit::init(ReducerConditional::select(f, reducer), output_result);
+  if (output_result != nullptr)
+    ValueInit::init(ReducerConditional::select(f, reducer), output_result);
   fut.wait();

   copy(result,result_cpu.data());
-  for(std::size_t i=0;i<td.num_tiles;i++)
-    ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
+  if (output_result != nullptr) {
+    for(std::size_t i=0;i<td.num_tiles;i++)
+      ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);

-  ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+    ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+  }
 }
@ -67,7 +67,7 @@ void scan_enqueue(
   hc::array<value_type> result(td.num_tiles);
   hc::array<value_type> scratch(len);

-  tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
+  tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
   {
     const auto local = t_idx.local[0];
     const auto global = t_idx.global[0];
@ -135,7 +135,7 @@ void scan_enqueue(
     ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);

   copy(result_cpu.data(),result);
-  hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
+  hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]]
   {
     // const auto local = t_idx.local[0];
     const auto global = t_idx.global[0];
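Both scan hunks change the lambdas from the blanket [&] capture to capturing the functor f explicitly by value: a kernel that runs asynchronously must carry its own copy rather than a reference into a stack frame that may already be gone. The distinction in plain C++, with std::thread standing in for the GPU kernel (all names hypothetical):

#include <thread>
#include <vector>

// Launch "kernels" asynchronously; the functor must be captured by value,
// because the caller's stack frame may be gone by the time the work runs.
template <typename F>
std::thread launch_async(F f) {           // f taken by value on purpose
  return std::thread([f]() {              // explicit by-value capture
    for (int i = 0; i < 4; ++i) f(i);
  });
}

int main() {
  std::vector<int> out(4, 0);
  std::thread t;
  {
    auto functor = [&out](int i) { out[i] = i * i; };
    t = launch_async(functor);            // a copy of functor escapes the scope
  }                                       // 'functor' itself is destroyed here
  t.join();
  return out[3] == 9 ? 0 : 1;
}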
@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
   return full != i ? _bit_scan_forward( ~i ) : -1 ;
 #elif defined( KOKKOS_COMPILER_IBM )
   return full != i ? __cnttz4( ~i ) : -1 ;
 #elif defined( KOKKOS_COMPILER_CRAYC )
   return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
+  return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
 #else
@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
   return _bit_scan_forward(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __cnttz4(i);
 #elif defined( KOKKOS_COMPILER_CRAYC )
   return i ? _popcnt(~i & (i-1)) : -1;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_ffs(i) - 1;
 #else
-  unsigned t = 1u;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t << 1;
-    ++r;
+  int offset = -1;
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
   }
-  return r;
+  return offset;
 #endif
 }
@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
   return _bit_scan_reverse(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return shift - __cntlz4(i);
 #elif defined( KOKKOS_COMPILER_CRAYC )
   return i ? shift - _leadz32(i) : 0 ;
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
 #else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t >> 1;
-    ++r;
+  int offset = 0;
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
   }
-  return r;
+  return offset;
 #endif
 }
@ -142,6 +142,8 @@ int bit_count( unsigned i )
   return _popcnt32(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __popcnt4(i);
 #elif defined( KOKKOS_COMPILER_CRAYC )
   return _popcnt(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
 #else
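Besides adding GCC builtin branches, the rewritten generic fallbacks fix an operator-precedence bug: in the old loop condition i & t == 0, the == binds tighter than &, so it actually tested i & (t == 0) and the loop never advanced for nonzero i. A host-side check of the corrected forward-scan fallback against the builtin (assumes a GCC-compatible compiler; the test harness itself is hypothetical):

#include <cassert>

// Portable fallback mirroring the corrected bit_scan_forward above:
// index of the lowest set bit, or -1 if no bit is set.
int scan_forward_fallback(unsigned i) {
  int offset = -1;
  if (i) {
    for (offset = 0; (i & (1u << offset)) == 0; ++offset) {}
  }
  return offset;
}

int main() {
  assert(scan_forward_fallback(0u) == -1);
  for (unsigned i = 1; i < (1u << 16); ++i) {
    // __builtin_ffs returns the 1-based position of the lowest set bit.
    assert(scan_forward_fallback(i) == __builtin_ffs(static_cast<int>(i)) - 1);
  }
  return 0;
}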
@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
   }
 }

-constexpr const char* HBWSpace::name() {
-  return m_name;
-}
-
 } // namespace Experimental
 } // namespace Kokkos
@ -114,7 +114,7 @@ struct TestComplexBasicMath {
   typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;

   void testit () {
-    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
     h_results = Kokkos::create_mirror_view(d_results);

     Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@ -125,6 +125,7 @@ struct TestComplexBasicMath {
     std::complex<double> b(3.25,5.75);
     std::complex<double> d(1.0,2.0);
     double c = 9.3;
+    int e = 2;

     std::complex<double> r;
     r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(), r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(), r.imag());
@ -147,6 +148,12 @@ struct TestComplexBasicMath {
     r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
     r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
     r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a;
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(), r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(), r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(), r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(), r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(), r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(), r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(), r.real()/e); ASSERT_FLOAT_EQ(h_results(23).imag(), r.imag()/e);
   }

   KOKKOS_INLINE_FUNCTION
@ -190,6 +197,12 @@ struct TestComplexBasicMath {
     d_results(17) = c-a;
     d_results(18) = c*a;
     d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = a-e;
+    d_results(22) = a*e;
+    d_results(23) = a/e;
   }
 };

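The new cases exercise the enhanced Kokkos::complex operator overloading from the changelog (#1052): arithmetic mixing complex<double> with an int scalar, which std::complex rejects (hence the commented-out r = a+e lines on the host side). A host-only sketch of the same overloads, assuming they accept integer scalars as the device-side test above implies:

#include <Kokkos_Core.hpp>
#include <cassert>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::complex<double> a(1.5, 2.5);
    int e = 2;
    // Mixed complex/int arithmetic via the enhanced operator overloads.
    Kokkos::complex<double> sum  = a + e;  // (3.5, 2.5)
    Kokkos::complex<double> prod = a * e;  // (3.0, 5.0)
    assert(sum.real()  == 3.5 && sum.imag()  == 2.5);
    assert(prod.real() == 3.0 && prod.imag() == 5.0);
  }
  Kokkos::finalize();
  return 0;
}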
@ -286,7 +286,9 @@ struct TestMDRange_2D {
     // Test with reducers - scalar
     {
       typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
-      range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
+      int s0 = 1;
+      int s1 = 1;
+      range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );

       TestMDRange_2D functor( N0, N1 );
@ -297,7 +299,7 @@ struct TestMDRange_2D {

       parallel_reduce( range, functor, reducer_scalar );

-      ASSERT_EQ( sum, 2 * N0 * N1 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
     }
     // Test with reducers - scalar view
     {
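Every MDRange hunk from here on applies the same fix (changelog issue #1041): lower bounds were silently ignored on GPUs, so the tests now start each dimension at a nonzero offset s and expect only (N - s) iterations per dimension. A condensed sketch of the idiom, mirroring the test's policy types; assumes lambda support is enabled for the default execution space:

#include <Kokkos_Core.hpp>
#include <cassert>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    typedef Kokkos::Experimental::MDRangePolicy<
        Kokkos::DefaultExecutionSpace, Kokkos::Experimental::Rank<2>,
        Kokkos::IndexType<int> > range_type;

    const int s0 = 1, s1 = 1;   // nonzero lower bounds
    const int N0 = 8, N1 = 8;   // upper bounds

    // Iterates [s0,N0) x [s1,N1) in 3x3 tiles; the fix ensures the lower
    // bounds are honored on GPU backends as well.
    range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );

    double sum = 0.0;
    Kokkos::parallel_reduce( range,
      KOKKOS_LAMBDA( const int, const int, double& lsum ) { lsum += 2.0; },
      sum );

    assert( sum == 2.0 * (N0 - s0) * (N1 - s1) );
  }
  Kokkos::finalize();
  return 0;
}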
@ -445,7 +447,9 @@ struct TestMDRange_2D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      const int s0 = 1;
+      const int s1 = 1;
+      range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
       TestMDRange_2D functor( N0, N1 );

       parallel_for( range, functor );
@ -454,8 +458,8 @@ struct TestMDRange_2D {
       Kokkos::deep_copy( h_view, functor.input_view );

       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
       {
         if ( h_view( i, j ) != 3 ) {
           ++counter;
@ -463,7 +467,7 @@ struct TestMDRange_2D {
       }

       if ( counter != 0 ) {
-        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
       }

       ASSERT_EQ( counter, 0 );
@ -699,6 +703,7 @@ struct TestMDRange_2D {

       ASSERT_EQ( counter, 0 );
     }
+
   } // end test_for2
 }; // MDRange_2D
@ -749,7 +754,10 @@ struct TestMDRange_3D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );

       TestMDRange_3D functor( N0, N1, N2 );
@ -757,7 +765,7 @@ struct TestMDRange_3D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
     }

     // Test with reducers - scalar
@ -952,7 +960,10 @@ struct TestMDRange_3D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
       TestMDRange_3D functor( N0, N1, N2 );

       parallel_for( range, functor );
@ -961,9 +972,9 @@ struct TestMDRange_3D {
       Kokkos::deep_copy( h_view, functor.input_view );

       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
       {
         if ( h_view( i, j, k ) != 3 ) {
           ++counter;
@ -971,7 +982,7 @@ struct TestMDRange_3D {
       }

       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
       }

       ASSERT_EQ( counter, 0 );
@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );

       TestMDRange_4D functor( N0, N1, N2, N3 );
@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
     }

     // Test with reducers - scalar
@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
       TestMDRange_4D functor( N0, N1, N2, N3 );

       parallel_for( range, functor );
@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
       Kokkos::deep_copy( h_view, functor.input_view );

       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
       {
         if ( h_view( i, j, k, l ) != 3 ) {
           ++counter;
@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
       }

       if ( counter != 0 ) {
-        printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+        printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
       }

       ASSERT_EQ( counter, 0 );
@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );

       TestMDRange_5D functor( N0, N1, N2, N3, N4 );
@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
     }

     // Test with reducers - scalar
@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
       TestMDRange_5D functor( N0, N1, N2, N3, N4 );

       parallel_for( range, functor );
@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
       Kokkos::deep_copy( h_view, functor.input_view );

       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
       {
         if ( h_view( i, j, k, l, m ) != 3 ) {
           ++counter;
@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
       }

       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
       }

       ASSERT_EQ( counter, 0 );
@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );

       TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
     }

     // Test with reducers - scalar
@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
       TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );

       parallel_for( range, functor );
@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
       Kokkos::deep_copy( h_view, functor.input_view );

       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
-      for ( int n = 0; n < N5; ++n )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      for ( int n = s5; n < N5; ++n )
       {
         if ( h_view( i, j, k, l, m, n ) != 3 ) {
           ++counter;
@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
       }

       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
       }

       ASSERT_EQ( counter, 0 );