Kokkos lib update

This commit is contained in:
Steve Plimpton 2016-09-08 13:56:18 -06:00
parent 0252347d43
commit 236ebf7fab
212 changed files with 18902 additions and 13466 deletions

View File

@ -1,4 +1,15 @@
IF(COMMAND TRIBITS_PACKAGE_DECL)
SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
ELSE()
SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
ENDIF()
IF(NOT KOKKOS_HAS_TRILINOS)
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
INCLUDE(cmake/tribits.cmake)
ENDIF()
#
# A) Forward declare the package so that certain options are also defined for
# subpackages
@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
# subpackages as well.
#
TRIBITS_ADD_DEBUG_OPTION()
# mfh 01 Aug 2016: See Issue #61:
#
# https://github.com/kokkos/kokkos/issues/61
#
# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
# for compatibility with Kokkos' Makefile build system.
TRIBITS_ADD_OPTION_AND_DEFINE(
${PACKAGE_NAME}_ENABLE_DEBUG
${PACKAGE_NAME_UC}_HAVE_DEBUG
"Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
${${PROJECT_NAME}_ENABLE_DEBUG}
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_SIERRA_BUILD
@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
"${TPL_ENABLE_MPI}"
)
# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option
#
# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check
# option (defined below) is annoyingly not all caps, but we need to
# keep it that way for backwards compatibility. If users forget and
# try using an all-caps variable, then make it count by using the
# all-caps version as the default value of the original, not-all-caps
# option. Otherwise, the default value of this option comes from
# Kokkos_ENABLE_DEBUG (see Issue #367).
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
ELSE()
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
ENDIF()
ELSE()
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
ENDIF()
ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Debug_Bounds_Check
KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
"Enable bounds checking support in Kokkos."
OFF
"Enable Kokkos::View run-time bounds checking."
"${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
)
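As a hedged illustration (not part of this diff): when the resulting KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK define is active, Kokkos::View accesses are range-checked at run time, so an out-of-range index aborts instead of silently corrupting memory. A minimal C++ sketch:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("v", 10);
    // v(12) = 1.0;  // with bounds checking enabled, this aborts with an error
    v(9) = 1.0;      // in-range access is always fine
  }
  Kokkos::finalize();
  return 0;
}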
TRIBITS_ADD_OPTION_AND_DEFINE(

View File

@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS)
#Options: OpenMP,Serial,Pthreads,Cuda
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
KOKKOS_ARCH ?= ""
#Options: yes,no
KOKKOS_DEBUG ?= "no"
@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
#NVIDIA based
@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM
#Any AVX?
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
# Decide what ISA level we are able to support
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
@ -304,8 +308,8 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -mcpu=power8
KOKKOS_LDFLAGS += -mcpu=power8
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
else
# Assume that this is really a GNU compiler
KOKKOS_CXXFLAGS += -march=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
endif
endif
endif
@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_53
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += -arch=sm_61
endif
endif
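For context, a hedged sketch of how the generated define can be consumed: Makefile.kokkos appends "#define KOKKOS_ARCH_PASCAL61 1" (plus the generic KOKKOS_ARCH_PASCAL) to KokkosCore_config.h, so downstream C++ code can branch on the target GPU at compile time. The branch body here is hypothetical:

#include <cstdio>

int main() {
#if defined(KOKKOS_ARCH_PASCAL61)
  std::printf("built for Pascal sm_61\n");   // hypothetical arch-specific path
#else
  std::printf("not a Pascal 6.1 build\n");
#endif
  return 0;
}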
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)

View File

@ -1,9 +1,5 @@
Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
endif
@ -61,6 +61,8 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
endif
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp

View File

@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov
====Requirements============================================================
============================================================================
Primary tested compilers are:
Primary tested compilers on X86 are:
GCC 4.7.2
GCC 4.8.4
GCC 4.9.2
@ -48,26 +48,43 @@ Primary tested compilers are:
Clang 3.5.2
Clang 3.6.1
Primary tested compilers on Power 8 are:
IBM XL 13.1.3 (OpenMP,Serial)
GCC 4.9.2 (OpenMP,Serial)
GCC 5.3.0 (OpenMP,Serial)
Secondary tested compilers are:
CUDA 6.5 (with gcc 4.7.2)
CUDA 7.0 (with gcc 4.7.2)
CUDA 7.5 (with gcc 4.8.4)
Other compilers working:
PGI 15.4
IBM XL 13.1.2
Cygwin 2.1.0 64bit with gcc 4.9.3
X86:
Intel 17.0.042 (the FENL example causes an internal compiler error)
PGI 15.4
Cygwin 2.1.0 64bit with gcc 4.9.3
KNL:
Intel 16.2.181 (the FENL example causes an internal compiler error)
Intel 17.0.042 (the FENL example causes an internal compiler error)
Known non-working combinations:
Power8:
GCC 6.1.0
Pthreads backend
Primary tested compilers are passing in release mode
with warnings as errors. We are using the following set
of flags:
with warnings as errors. They are also tested with a comprehensive set of
backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
We are using the following set of flags:
GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
-Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
Secondary compilers are passing without -Werror.
Other compilers are tested occasionally.
Other compilers are tested occasionally, in particular when pushing from develop to
master branch, without -Werror and only for a select set of backends.
============================================================================
====Getting started=========================================================

View File

@ -771,6 +771,7 @@ namespace Kokkos {
friend class Random_XorShift1024_Pool<DeviceType>;
public:
typedef Random_XorShift1024_Pool<DeviceType> pool_type;
typedef DeviceType device_type;
enum {MAX_URAND = 0xffffffffU};
@ -779,10 +780,10 @@ namespace Kokkos {
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx){
for(int i=0 ; i<16; i++)
state_[i] = state[i];
state_[i] = state(state_idx,i);
}
KOKKOS_INLINE_FUNCTION
@ -933,6 +934,7 @@ namespace Kokkos {
state_data_type state_;
int_view_type p_;
int num_states_;
friend class Random_XorShift1024<DeviceType>;
public:
typedef Random_XorShift1024<DeviceType> generator_type;
@ -1001,7 +1003,7 @@ namespace Kokkos {
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<DeviceType> get_state() const {
const int i = DeviceType::hardware_thread_id();
return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
return Random_XorShift1024<DeviceType>(state_,p_(i),i);
};
KOKKOS_INLINE_FUNCTION
@ -1020,10 +1022,12 @@ namespace Kokkos {
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
public:
typedef Kokkos::Cuda device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
@ -1031,30 +1035,30 @@ namespace Kokkos {
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(state){
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[ p_ ];
uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
tmp = tmp>>16;
return static_cast<uint32_t>(tmp&MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[ p_ ];
uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
#endif
}
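A minimal usage sketch of the pool/generator pattern these changes preserve: get_state(), urand(), and free_state() are taken from the source above, while the functor name, view, and seed are illustrative.

#include <cstdint>
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

struct Draw {
  Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool;
  Kokkos::View<uint32_t*> out;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i) const {
    auto gen = pool.get_state();  // acquire a per-thread generator
    out(i) = gen.urand();         // generator state is now read with the pool view's stride
    pool.free_state(gen);         // return the generator to the pool
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool(31891);
    Kokkos::View<uint32_t*> out("out", 1000);
    Kokkos::parallel_for(1000, Draw{pool, out});
  }
  Kokkos::finalize();
  return 0;
}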
@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
#endif
namespace Impl {
template<class ViewType, class RandomPool, int loops, int rank>
template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
struct fill_random_functor_range;
template<class ViewType, class RandomPool, int loops, int rank>
template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
struct fill_random_functor_begin_end;
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (const IndexType& i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0())
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0()))
a(idx) = Rand::draw(gen,range);
}
rand_pool.free_state(gen);
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
a(idx,k) = Rand::draw(gen,range);
}
}
@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
a(idx,k,l) = Rand::draw(gen,range);
}
}
@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
a(idx,k,l,m) = Rand::draw(gen,range);
}
}
@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
a(idx,k,l,m,n) = Rand::draw(gen,range);
}
}
@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
a(idx,k,l,m,n,o) = Rand::draw(gen,range);
}
}
@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
}
}
@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
a(a_),rand_pool(rand_pool_),range(range_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(unsigned int q=0;q<a.dimension_7();q++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
}
}
rand_pool.free_state(gen);
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0())
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0()))
a(idx) = Rand::draw(gen,begin,end);
}
rand_pool.free_state(gen);
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
a(idx,k) = Rand::draw(gen,begin,end);
}
}
@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
a(idx,k,l) = Rand::draw(gen,begin,end);
}
}
@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1584,14 +1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
a(idx,k,l,m) = Rand::draw(gen,begin,end);
}
}
@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()){
for(unsigned int l=0;l<a.dimension_1();l++)
for(unsigned int m=0;m<a.dimension_2();m++)
for(unsigned int n=0;n<a.dimension_3();n++)
for(unsigned int o=0;o<a.dimension_4();o++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())){
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++)
a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
}
}
@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
}
}
@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
}
}
@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
}
};
template<class ViewType, class RandomPool, int loops>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
template<class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
typedef typename ViewType::execution_space execution_space;
ViewType a;
RandomPool rand_pool;
@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator() (unsigned int i) const {
void operator() (IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for(unsigned int j=0;j<loops;j++) {
const uint64_t idx = i*loops+j;
if(idx<a.dimension_0()) {
for(unsigned int k=0;k<a.dimension_1();k++)
for(unsigned int l=0;l<a.dimension_2();l++)
for(unsigned int m=0;m<a.dimension_3();m++)
for(unsigned int n=0;n<a.dimension_4();n++)
for(unsigned int o=0;o<a.dimension_5();o++)
for(unsigned int p=0;p<a.dimension_6();p++)
for(unsigned int q=0;q<a.dimension_7();q++)
for(IndexType j=0;j<loops;j++) {
const IndexType idx = i*loops+j;
if(idx<static_cast<IndexType>(a.dimension_0())) {
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
}
}
@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
}
};
template<class ViewType, class RandomPool>
}
template<class ViewType, class RandomPool, class IndexType = int64_t>
void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
int64_t LDA = a.dimension_0();
if(LDA>0)
parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
}
template<class ViewType, class RandomPool>
template<class ViewType, class RandomPool, class IndexType = int64_t>
void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
int64_t LDA = a.dimension_0();
if(LDA>0)
parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
}
}
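A hedged usage sketch of the new IndexType template parameter on fill_random (defaulting to int64_t per the diff); the view, pool type, and seed are illustrative:

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> a("a", 100000);
    Kokkos::Random_XorShift1024_Pool<Kokkos::DefaultExecutionSpace> pool(31891);
    Kokkos::fill_random(a, pool, 1.0);  // IndexType defaults to int64_t
    // Force 32-bit indexing inside the fill functors:
    Kokkos::fill_random<decltype(a), decltype(pool), int>(a, pool, 1.0);
  }
  Kokkos::finalize();
  return 0;
}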

View File

@ -50,6 +50,7 @@
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <cmath>
#include <chrono>
namespace Test {
@ -207,7 +208,6 @@ struct test_histogram1d_functor {
density_1d (d1d),
mean (1.0*num_draws/HIST_DIM1D*3)
{
printf ("Mean: %e\n", mean);
}
KOKKOS_INLINE_FUNCTION void
@ -295,7 +295,7 @@ struct test_random_scalar {
parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
//printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
double tolerance = 2.0*sqrt(1.0/num_draws);
double tolerance = 1.6*sqrt(1.0/num_draws);
double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
double variance_expect = 1.0/3.0*mean_expect*mean_expect;
double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
@ -303,10 +303,10 @@ struct test_random_scalar {
double covariance_eps = result.covariance/num_draws/2/variance_expect;
pass_mean = ((-tolerance < mean_eps) &&
( tolerance > mean_eps)) ? 1:0;
pass_var = ((-tolerance < variance_eps) &&
( tolerance > variance_eps)) ? 1:0;
pass_covar = ((-1.4*tolerance < covariance_eps) &&
( 1.4*tolerance > covariance_eps)) ? 1:0;
pass_var = ((-1.5*tolerance < variance_eps) &&
( 1.5*tolerance > variance_eps)) ? 1:0;
pass_covar = ((-2.0*tolerance < covariance_eps) &&
( 2.0*tolerance > covariance_eps)) ? 1:0;
cerr << "Pass: " << pass_mean
<< " " << pass_var
<< " " << mean_eps
@ -328,12 +328,12 @@ struct test_random_scalar {
double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
pass_hist1d_mean = ((-tolerance < mean_eps) &&
( tolerance > mean_eps)) ? 1:0;
pass_hist1d_var = ((-tolerance < variance_eps) &&
( tolerance > variance_eps)) ? 1:0;
pass_hist1d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
pass_hist1d_mean = ((-0.0001 < mean_eps) &&
( 0.0001 > mean_eps)) ? 1:0;
pass_hist1d_var = ((-0.07 < variance_eps) &&
( 0.07 > variance_eps)) ? 1:0;
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
( 0.06 > covariance_eps)) ? 1:0;
cerr << "Density 1D: " << mean_eps
<< " " << variance_eps
@ -363,8 +363,8 @@ struct test_random_scalar {
double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
pass_hist3d_mean = ((-tolerance < mean_eps) &&
( tolerance > mean_eps)) ? 1:0;
pass_hist3d_var = ((-tolerance < variance_eps) &&
( tolerance > variance_eps)) ? 1:0;
pass_hist3d_var = ((-1.2*tolerance < variance_eps) &&
( 1.2*tolerance > variance_eps)) ? 1:0;
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
( tolerance > covariance_eps)) ? 1:0;
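The tolerances above scale with the standard error of the estimators, which shrinks as 1/sqrt(num_draws); a small illustrative calculation (the draw count is hypothetical):

#include <cmath>
#include <cstdio>

int main() {
  const double num_draws = 1024.0 * 1024.0;               // hypothetical draw count
  const double tolerance = 1.6 * std::sqrt(1.0 / num_draws);
  std::printf("tolerance = %e\n", tolerance);             // ~1.5625e-03
  return 0;
}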
@ -386,8 +386,13 @@ void test_random(unsigned int num_draws)
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
cerr << "Test Seed:" << ticks << endl;
RandomGenerator pool(ticks);
cerr << "Test Scalar=int" << endl;
RandomGenerator pool(31891);
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
ASSERT_EQ( test_int.pass_mean,1);
ASSERT_EQ( test_int.pass_var,1);

View File

@ -0,0 +1,79 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
# Check for CUDA support
SET(_CUDA_FAILURE OFF)
# Have CMake find CUDA
IF(NOT _CUDA_FAILURE)
FIND_PACKAGE(CUDA 3.2)
IF (NOT CUDA_FOUND)
SET(_CUDA_FAILURE ON)
ENDIF()
ENDIF()
IF(NOT _CUDA_FAILURE)
# if we have not encountered a failure
macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
endmacro()
GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ELSE()
SET(TPL_ENABLE_CUDA OFF)
ENDIF()

View File

@ -0,0 +1,64 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
IF (TPL_ENABLE_CUDA)
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Hardware locality detection and control library.
#
# Acquisition information:
# Date checked: November 2011
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: http://www.open-mpi.org/projects/hwloc/
# Version: 1.3
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
REQUIRED_HEADERS hwloc.h
REQUIRED_LIBS_NAMES "hwloc"
)

View File

@ -0,0 +1,83 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
SET(USE_THREADS FALSE)
IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
# Use CMake's Thread finder since it is a bit smarter in determining
# whether pthreads is already built into the compiler and doesn't need
# a library to link.
FIND_PACKAGE(Threads)
#If Threads found a copy of pthreads make sure it is one of the cases the tribits
#tpl system cannot handle.
IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
SET(USE_THREADS TRUE)
ENDIF()
ENDIF()
ENDIF()
IF(USE_THREADS)
SET(TPL_Pthread_INCLUDE_DIRS "")
SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
SET(TPL_Pthread_LIBRARY_DIRS "")
TRIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
ELSE()
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
REQUIRED_HEADERS pthread.h
REQUIRED_LIBS_NAMES pthread
)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Qthreads lightweight threading/tasking library.
#
# Acquisition information:
# Date checked: July 2014
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -0,0 +1,485 @@
INCLUDE(CMakeParseArguments)
INCLUDE(CTest)
FUNCTION(ASSERT_DEFINED VARS)
FOREACH(VAR ${VARS})
IF(NOT DEFINED ${VAR})
MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!")
ENDIF()
ENDFOREACH()
ENDFUNCTION()
MACRO(GLOBAL_SET VARNAME)
SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
ENDMACRO()
MACRO(PREPEND_GLOBAL_SET VARNAME)
ASSERT_DEFINED(${VARNAME})
GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
ENDMACRO()
FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
ASSERT_DEFINED(${VARNAME})
IF (${VARNAME})
SET(TMP ${${VARNAME}})
LIST(REMOVE_DUPLICATES TMP)
GLOBAL_SET(${VARNAME} ${TMP})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE)
MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
IF(${USER_OPTION_NAME})
GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
ELSE()
GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
ENDIF()
ENDIF()
ENDMACRO()
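# Illustrative use of the macro above (the option and macro names here are
# hypothetical, not from this commit):
#   TRIBITS_ADD_OPTION_AND_DEFINE(
#     Kokkos_ENABLE_Foo      # user-facing CACHE BOOL option
#     KOKKOS_HAVE_FOO        # variable globally SET to ON/OFF to match
#     "Enable the hypothetical Foo feature."
#     OFF
#   )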
FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE)
# Configure the file
CONFIGURE_FILE(
${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
)
ENDFUNCTION()
MACRO(TRIBITS_ADD_DEBUG_OPTION)
TRIBITS_ADD_OPTION_AND_DEFINE(
${PROJECT_NAME}_ENABLE_DEBUG
HAVE_${PROJECT_NAME_UC}_DEBUG
"Enable a host of runtime debug checking."
OFF
)
ENDMACRO()
MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
FOREACH(TEST_DIR ${ARGN})
ADD_SUBDIRECTORY(${TEST_DIR})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
FOREACH(EXAMPLE_DIR ${ARGN})
ADD_SUBDIRECTORY(${EXAMPLE_DIR})
ENDFOREACH()
ENDIF()
ENDMACRO()
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
SET(PROP_VALUES)
FOREACH(TARGET_X ${ARGN})
LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
ENDFOREACH()
SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
ENDMACRO()
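# ADD_INTERFACE_LIBRARY below emulates an interface library on older CMake:
# it builds an empty static library from a generated dummy.cpp whose only
# purpose is to carry transitive link and include requirements.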
MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
ENDMACRO()
# Older versions of CMake do not make include directories transitive
MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
FOREACH(DEP_LIB ${ARGN})
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
ENDFOREACH()
ENDMACRO()
FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
SET(oneValueArgs)
SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
IF(PARSE_HEADERS)
LIST(REMOVE_DUPLICATES PARSE_HEADERS)
ENDIF()
IF(PARSE_SOURCES)
LIST(REMOVE_DUPLICATES PARSE_SOURCES)
ENDIF()
# Local variable to hold all of the libraries that will be directly linked
# to this library.
SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
# Add dependent libraries passed directly in
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
IF (PARSE_DEPLIBS)
LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
ENDIF()
# Add the library and all the dependencies
IF (PARSE_DEFINES)
ADD_DEFINITIONS(${PARSE_DEFINES})
ENDIF()
IF (PARSE_STATIC)
SET(STATIC_KEYWORD "STATIC")
ELSE()
SET(STATIC_KEYWORD)
ENDIF()
IF (PARSE_SHARED)
SET(SHARED_KEYWORD "SHARED")
ELSE()
SET(SHARED_KEYWORD)
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
IF (NOT PARSE_CUDALIBRARY)
ADD_LIBRARY(
${LIBRARY_NAME}
${STATIC_KEYWORD}
${SHARED_KEYWORD}
${EXCLUDE_FROM_ALL_KEYWORD}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ELSE()
CUDA_ADD_LIBRARY(
${LIBRARY_NAME}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ENDIF()
TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
INSTALL(
TARGETS ${LIBRARY_NAME}
EXPORT ${PROJECT_NAME}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
FILES ${PARSE_HEADERS}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
DIRECTORY ${PARSE_HEADERS_INSTALL_SUBDIR}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
ENDIF()
IF (NOT PARSE_TESTONLY)
PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
ENDIF()
ENDFUNCTION()
FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
# NOTE: PARSE_TARGET_DEFINES is applied after ADD_EXECUTABLE below;
# TARGET_COMPILE_DEFINITIONS requires the target to exist first.
SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
IF (PARSE_TESTONLYLIBS)
LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
ENDIF()
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
SET (EXE_SOURCES)
IF(PARSE_DIRECTORY)
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
IF(IS_ABSOLUTE ${SOURCE_FILE})
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ELSE()
SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
ENDIF()
ENDFOREACH( )
ELSE()
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ENDFOREACH( )
ENDIF()
SET(EXE_BINARY_NAME ${EXE_NAME})
IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
IF (PARSE_TARGET_DEFINES)
TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
ENDIF()
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_INSTALLABLE)
INSTALL(
TARGETS ${EXE_BINARY_NAME}
EXPORT ${PROJECT_NAME}
DESTINATION bin
)
ENDIF()
ENDFUNCTION()
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
IF(WIN32)
ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
ELSE()
ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
ENDIF()
ADD_DEPENDENCIES(check ${TEST_NAME})
IF(PARSE_FAIL_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_PASS_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_WILL_FAIL)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
ENDIF()
IF(PARSE_ADDED_TESTS_NAMES_OUT)
SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
ENDMACRO()
FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
SET(oneValueArgs)
SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
IF (PARSE_REQUIRED_LIBS_NAMES)
FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
IF(NOT TPL_${TPL_NAME}_LIBRARIES)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (PARSE_REQUIRED_HEADERS)
FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (_${TPL_NAME}_ENABLE_SUCCESS)
TRIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
INCLUDE("${TPL_FILE}")
IF(TARGET TPL_LIB_${TPL_NAME})
MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} TRUE)
ELSE()
MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} FALSE)
ENDIF()
ENDMACRO()
MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
IF(TYPE STREQUAL "REQUIRED")
SET(REQUIRED TRUE)
ELSE()
SET(REQUIRED FALSE)
ENDIF()
IF(TARGET ${TARGET_NAME})
PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
ELSE()
IF(REQUIRED)
MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
ENDIF()
ENDIF()
ENDMACRO()
MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ENABLE_TPLS)
FOREACH(TPL ${ARGN})
IF(TARGET ${TPL})
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
ELSE()
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
ENDIF()
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
SET(options)
SET(oneValueArgs)
SET(multiValueArgs
LIB_REQUIRED_PACKAGES
LIB_OPTIONAL_PACKAGES
TEST_REQUIRED_PACKAGES
TEST_OPTIONAL_PACKAGES
LIB_REQUIRED_TPLS
LIB_OPTIONAL_TPLS
TEST_REQUIRED_TPLS
TEST_OPTIONAL_TPLS
REGRESSION_EMAIL_LIST
SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
ENDMACRO()
MACRO(TRIBITS_SUBPACKAGE NAME)
SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
ENDMACRO(TRIBITS_SUBPACKAGE)
MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
MACRO(TRIBITS_PACKAGE_DECL NAME)
PROJECT(${NAME})
STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
SET(PACKAGE_NAME ${PROJECT_NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
FOREACH(TPL_FILE ${TPLS_FILES})
TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PROCESS_SUBPACKAGES)
FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
FOREACH(SUBPACKAGE ${SUBPACKAGES})
GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
ENDFOREACH()
ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
MACRO(TRIBITS_PACKAGE_DEF)
ENDMACRO(TRIBITS_PACKAGE_DEF)
MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
MACRO(TRIBITS_EXCLUDE_FILES)
ENDMACRO(TRIBITS_EXCLUDE_FILES)
MACRO(TRIBITS_PACKAGE_POSTPROCESS)
ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)
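# A minimal sketch of how a subpackage's CMakeLists.txt would consume this
# shim (the names Core/kokkoscore are illustrative, not from this commit):
#   TRIBITS_SUBPACKAGE(Core)
#   TRIBITS_ADD_LIBRARY(
#     kokkoscore
#     HEADERS ${HEADERS}
#     SOURCES ${SOURCES}
#   )
#   TRIBITS_SUBPACKAGE_POSTPROCESS()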

View File

@ -0,0 +1,153 @@
// -------------------------------------------------------------------------------- //
The following steps are for workstations/servers with the SEMS environment installed.
// -------------------------------------------------------------------------------- //
Summary:
- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
// -------------------------------------------------------------------------------- //
// -------------------------------------------------------------------------------- //
Step 1:
1.1. Update kokkos develop branch (NOT a fork)
(From kokkos directory):
git fetch --all
git checkout develop
git reset --hard origin/develop
1.2. Create a testing directory - here the directory is created within the kokkos directory
mkdir testing
cd testing
1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
../config/test_all_sandia
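For example (options are illustrative; see the --help text in the script):
../config/test_all_sandia --num=4 --build-list=OpenMP,Serial gcc/4.8.4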
1.4. Clean the repository of untracked files
cd ../
git clean -df
// -------------------------------------------------------------------------------- //
Step 2:
2.1. Update the Trilinos develop branch
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
2.2. Snapshot Kokkos into Trilinos; this requires python/2.7.9 and that both Trilinos and Kokkos be clean (no untracked or modified files)
module load python/2.7.9
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
// -------------------------------------------------------------------------------- //
Step 3:
3.1. Build and test Trilinos in three different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following configurations with the appropriate environment variable(s):
- GCC/4.7.2-OpenMP/Complex
Run tests with the following environment variable:
export OMP_NUM_THREADS=2
- Intel/15.0.2-Serial/NoComplex
- GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
Run tests with the following environment variables:
export CUDA_LAUNCH_BLOCKING=1
export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
mkdir Build
cd Build
cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
** Set the path to Trilinos appropriately within the configure-all script **
source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
source configure-all
make -k (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
ctest
3.2. Compare the failed test output to the test output on the dashboard (testing.sandia.gov/cdash, select Trilinos); investigate and fix problems if new tests fail after the Kokkos snapshot
// -------------------------------------------------------------------------------- //
Step 4:
4.1. Once all Trilinos tests pass, promote the Kokkos develop branch to master on GitHub
- DO NOT fast-forward the merge!!!!
(From kokkos directory):
git checkout master
git fetch --all
# Ensure we are on the current origin/master
git reset --hard origin/master
git merge --no-ff origin/develop
4.2. Update the tag in kokkos/config/master_history.txt
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
Tag format: #.#.##
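For example, the existing tag 2.01.06 in config/master_history.txt denotes major version 2, minor version 01, six weeks after the last minor-version update.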
# Prepend master_history.txt with
# tag: #.#.##
# date: mm/dd/yyyy
# master: sha1
# develop: sha1
# -----------------------
git commit --amend -a
git tag -a #.#.##
tag: #.#.##
date: mm/dd/yyyy
master: sha1
develop: sha1
git push --follow-tags origin master
// -------------------------------------------------------------------------------- //
Step 5:
5.1. Make sure Trilinos is up to date; other changes have likely been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot, the testing procedure may need to be repeated
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
5.2. Snapshot Kokkos master branch into Trilinos
(From kokkos directory):
git fetch --all
git checkout tags/#.#.##
git clean -df
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
5.3. Push the updated develop branch of Trilinos to GitHub - congratulations!!!
(From Trilinos directory):
git push
// -------------------------------------------------------------------------------- //

View File

@ -0,0 +1,3 @@
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a

View File

@ -1,17 +1,12 @@
#!/bin/bash
#
# This shell script (nvcc_wrapper) wraps both the host compiler and
# NVCC, if you are building Trilinos with CUDA enabled. The script
# remedies some differences between the interface of NVCC and that of
# the host compiler, in particular for linking. It also means that
# Trilinos doesn't need separate .cu files; it can just use .cpp
# files.
# NVCC, if you are building legacy C or C++ code with CUDA enabled.
# The script remedies some differences between the interface of NVCC
# and that of the host compiler, in particular for linking.
# It also means that a legacy code doesn't need separate .cu files;
# it can just use .cpp files.
#
# Hopefully, at some point, NVIDIA may fix NVCC so as to make this
# script obsolete. For now, this script exists and if you want to
# build Trilinos with CUDA enabled, you must use this script as your
# compiler.
# Default settings: change these according to your machine. For
# example, you may have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten
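# Example invocation (illustrative; g++ assumed as the default host compiler):
#   nvcc_wrapper -O3 -arch=sm_35 -c foo.cpp -o foo.o
# This forwards -arch to nvcc and places -O3 on the shared argument list
# passed to both nvcc and the host compiler.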
@ -53,6 +48,10 @@ object_files=""
# Link objects for the host linker only
object_files_xlinker=""
# Shared libraries with version numbers are not handled correctly by NVCC
shared_versioned_libraries_host=""
shared_versioned_libraries=""
# Does the User set the architecture
arch_set=0
@ -76,6 +75,9 @@ first_xcompiler_arg=1
temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
@ -97,8 +99,17 @@ do
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1"
;;
# Ensure we only have one optimization flag because NVCC doesn't allow multiple
-O*)
if [ $optimization_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
else
shared_args="$shared_args $1"
optimization_applied=1
fi
;;
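# Example: "nvcc_wrapper -O3 -O2 foo.cpp" warns as above and keeps only -O3,
# since nvcc accepts a single optimization setting per command line.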
#Handle shared args (valid for both nvcc and the host compiler)
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
shared_args="$shared_args $1"
;;
#Handle shared args that have an argument
@ -107,7 +118,7 @@ do
shift
;;
#Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1"
;;
#Handle known nvcc args that have an argument
@ -175,10 +186,15 @@ do
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.so.*|*.dylib)
*.dylib)
object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle shared libraries with *.so.* names which nvcc can't do.
*.so.*)
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
;;
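# Example: a link input such as libmpi.so.12 stays verbatim on the host-only
# link line but is prefixed with -Xlinker when handed to nvcc.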
#All other args are sent to the host compiler
*)
if [ $first_xcompiler_arg -eq 1 ]; then
@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then
fi
#Compose compilation command
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args"
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
if [ $first_xcompiler_arg -eq 0 ]; then
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
fi
#Compose host only command
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args"
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
if [ $replace_pragma_ident -eq 1 ]; then

View File

@ -6,34 +6,36 @@
set -o pipefail
# Determine current machine
MACHINE=""
HOSTNAME=$(hostname)
if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
MACHINE=white
elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
MACHINE=bowman
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
MACHINE=shepard
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
MACHINE=sems
else
echo "Unrecognized machine" >&2
exit 1
fi
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS=""
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
# Default. Machine specific can override
DEBUG=False
ARGS=""
CUSTOM_BUILD_LIST=""
@ -41,6 +43,107 @@ DRYRUN=False
BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
TEST_SCRIPT=False
SKIP_HWLOC=False
ARCH_FLAG=""
#
# Machine specific config
#
if [ "$MACHINE" = "sems" ]; then
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
# Don't do pthread on white
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
)
ARCH_FLAG="--arch=Power8"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "bowman" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=KNL"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "shepard" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=HSW"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
while [[ $# > 0 ]]
do
@ -61,6 +164,9 @@ BUILD_ONLY=True
--test-script*)
TEST_SCRIPT=True
;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel "
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
@ -82,21 +189,16 @@ echo " Valid items:"
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers"
echo " gcc/4.7.2"
echo " gcc/4.8.4"
echo " gcc/4.9.2"
echo " gcc/5.1.0"
echo " intel/14.0.4"
echo " intel/15.0.2"
echo " intel/16.0.1"
echo " clang/3.5.2"
echo " clang/3.6.1"
echo " cuda/6.5.14"
echo " cuda/7.0.28"
echo " cuda/7.5.18"
echo " supported compilers sems"
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
echo " $COMPILER"
done
echo ""
echo "Examples:"
echo " Run all tests"
echo " % test_all_sandia"
@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then
ARGS='?'
fi
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
# Process args to figure out which compilers to test
COMPILERS_TO_TEST=""
for ARG in $ARGS; do
@ -240,18 +327,19 @@ run_cmd() {
fi
}
# report_and_log_test_results <SUCCESS> <DESC> <PHASE>
# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
report_and_log_test_result() {
# Use sane var names
local success=$1; local desc=$2; local phase=$3;
local success=$1; local desc=$2; local comment=$3;
if [ "$success" = "0" ]; then
echo " PASSED $desc"
touch $PASSED_DIR/$desc
echo $comment > $PASSED_DIR/$desc
else
# For failures, comment should be the name of the phase that failed
echo " FAILED $desc" >&2
echo $phase > $FAILED_DIR/$desc
cat ${desc}.${phase}.log
echo $comment > $FAILED_DIR/$desc
cat ${desc}.${comment}.log
fi
}
@ -309,6 +397,8 @@ single_build_and_test() {
echo " Starting job $desc"
local comment="no_comment"
if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand
@ -316,14 +406,19 @@ single_build_and_test() {
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"
if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
local -i run_end_time=$(date +%s)
comment="$comment run_time=$(($run_end_time-$build_end_time))"
fi
fi
report_and_log_test_result 0 $desc
report_and_log_test_result 0 $desc "$comment"
return 0
}
@ -374,7 +469,7 @@ build_and_test_all() {
run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too
if [[ "$compiler" != cuda* ]]; then
if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi
done
@ -401,7 +496,11 @@ wait_summarize_and_exit() {
echo "PASSED TESTS"
echo "#######################################################"
\ls -1 $PASSED_DIR | sort
local passed_test
for passed_test in $(\ls -1 $PASSED_DIR | sort)
do
echo $passed_test $(cat $PASSED_DIR/$passed_test)
done
echo "#######################################################"
echo "FAILED TESTS"
@ -409,7 +508,7 @@ wait_summarize_and_exit() {
local failed_test
local -i rv=0
for failed_test in $(\ls -1 $FAILED_DIR)
for failed_test in $(\ls -1 $FAILED_DIR | sort)
do
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
rv=$rv+1

View File

@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES TestOpenMP.cpp)
ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerformanceTest
# Per #374, we always want to build this test, but we only want to run
# it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
TRIBITS_ADD_EXECUTABLE(
PerfTestExec
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
TRIBITS_ADD_TEST(
PerformanceTest
NAME PerfTestExec
COMM serial mpi
NUM_MPI_PROCS 1
CATEGORIES PERFORMANCE
FAIL_REGULAR_EXPRESSION " FAILED "
)
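# Note (assumption about standard TriBITS behavior): a PERFORMANCE-category
# test runs only when PERFORMANCE is among the test categories enabled at
# configure time, so PerfTestExec is always built but scheduled selectively.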

View File

@ -54,6 +54,8 @@
#if defined( KOKKOS_HAVE_CUDA )
#include <TestDynRankView.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp>
@ -77,6 +79,13 @@ protected:
}
};
TEST_F( cuda, dynrankview_perf )
{
std::cout << "Cuda" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
}
TEST_F( cuda, global_2_local)
{
std::cout << "Cuda" << std::endl;

View File

@ -0,0 +1,265 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
#define KOKKOS_TEST_DYNRANKVIEW_HPP
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <vector>
#include <impl/Kokkos_Timer.hpp>
// Compare performance of DynRankView to View, with a specific focus on the parenthesis operators
namespace Performance {
//View functor
template <typename DeviceType>
struct InitViewFunctor {
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::View<double*, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
struct InitStrideViewFunctor {
typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
inviewtype _inview;
InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
};
template <typename DeviceType>
struct InitViewRank7Functor {
typedef Kokkos::View<double*******, DeviceType> inviewtype;
inviewtype _inview;
InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
}
}
}
};
//DynRankView functor
template <typename DeviceType>
struct InitDynRankViewFunctor {
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
void test_dynrankview_op_perf( const int par_size )
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
const size_type dim2 = 900;
const size_type dim3 = 300;
double elapsed_time_view = 0;
double elapsed_time_compview = 0;
double elapsed_time_strideview = 0;
double elapsed_time_view_rank7 = 0;
double elapsed_time_drview = 0;
double elapsed_time_compdrview = 0;
Kokkos::Timer timer;
{
Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
typedef InitViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view = timer.seconds();
std::cout << " View time (init only): " << elapsed_time_view << std::endl;
timer.reset();
Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
DeviceType::fence();
elapsed_time_compview = timer.seconds();
std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
timer.reset();
Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
DeviceType::fence();
elapsed_time_strideview = timer.seconds();
std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
}
{
Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
typedef InitViewRank7Functor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view_rank7 = timer.seconds();
std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
}
{
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
typedef InitDynRankViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testdrview) );
DeviceType::fence();
elapsed_time_drview = timer.seconds();
std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
timer.reset();
Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
DeviceType::fence();
elapsed_time_compdrview = timer.seconds();
std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
}
std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
timer.reset();
} //end test_dynrankview
} //end Performance
#endif

View File

@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids)
std::cout << num_ids << ", ";
double elasped_time = 0;
Kokkos::Impl::Timer timer;
Kokkos::Timer timer;
local_id_view local_2_global("local_ids", num_ids);
global_id_view global_2_local((3u*num_ids)/2u);

View File

@ -50,6 +50,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@ -91,6 +93,13 @@ protected:
}
};
TEST_F( openmp, dynrankview_perf )
{
std::cout << "OpenMP" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
}
TEST_F( openmp, global_2_local)
{
std::cout << "OpenMP" << std::endl;

View File

@ -52,6 +52,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@ -85,6 +87,13 @@ protected:
}
};
TEST_F( threads, dynrankview_perf )
{
std::cout << "Threads" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
}
TEST_F( threads, global_2_local)
{
std::cout << "Threads" << std::endl;

View File

@ -80,7 +80,7 @@ struct UnorderedMapTest
, map(capacity)
, histogram(map.get_histogram())
{
Kokkos::Impl::Timer wall_clock ;
Kokkos::Timer wall_clock ;
wall_clock.reset();
value_type v = {};
@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name)
distance_out << "\b\b\b " << std::endl;
block_distance_out << "\b\b\b " << std::endl;
Kokkos::Impl::Timer wall_clock ;
Kokkos::Timer wall_clock ;
for (int i=0; i < num_collisions ; ++i) {
wall_clock.reset();
std::cout << "Collisions: " << collisions[i] << std::endl;

File diff suppressed because it is too large

View File

@ -77,10 +77,7 @@ private:
public:
typedef Kokkos::Experimental::MemoryPool
< typename traits::memory_space
, typename traits::execution_space
> memory_pool ;
typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
private:
@ -338,7 +335,7 @@ public:
void operator()( unsigned i ) const
{
if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() );
m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
}
m_chunks[i] = 0 ;
}
@ -397,7 +394,7 @@ public:
// The memory pool chunk is guaranteed to be a power of two
, m_chunk_shift(
Kokkos::Impl::integral_power_of_two(
m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) )
m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
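// Worked example (hypothetical sizes): get_min_block_size() == 1024 bytes
// with an 8-byte value_type gives 128 values per chunk, so m_chunk_shift == 7,
// m_chunk_mask == 127, and arg_size_max == 1000 yields
// m_chunk_max == (1000 + 127) >> 7 == 8 chunks.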
{

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,6 +45,7 @@
#define KOKKOS_BITSET_IMPL_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_BitOps.hpp>
#include <stdint.h>
#include <cstdio>
@ -52,122 +53,57 @@
#include <iostream>
#include <iomanip>
namespace Kokkos { namespace Impl {
namespace Kokkos {
namespace Impl {
KOKKOS_FORCEINLINE_FUNCTION
unsigned rotate_right(unsigned i, int r)
unsigned rotate_right( unsigned i, int r )
{
enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
return r ? ((i >> r) | (i << (size-r))) : i ;
enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
}
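// Example: for 32-bit unsigned, rotate_right(0x80000001u, 1) == 0xC0000000u.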
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_forward(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
return _bit_scan_forward(i);
#else
unsigned t = 1u;
int r = 0;
while (i && (i & t == 0))
{
t = t << 1;
++r;
}
return r;
#endif
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_reverse(unsigned i)
{
enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
while (i && (i & t == 0))
{
t = t >> 1;
++r;
}
return r;
#endif
}
// count the bits set
KOKKOS_FORCEINLINE_FUNCTION
int popcount(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ((i >> 1) & ~0u/3u); // temp
i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp
i = (i + (i >> 4)) & ~0u/255u*15u; // temp
return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
#endif
}
template <typename Bitset>
template < typename Bitset >
struct BitsetCount
{
typedef Bitset bitset_type;
typedef typename bitset_type::execution_space::execution_space execution_space;
typedef typename bitset_type::size_type size_type;
typedef size_type value_type;
typedef Bitset bitset_type;
typedef typename bitset_type::execution_space::execution_space execution_space;
typedef typename bitset_type::size_type size_type;
typedef size_type value_type;
bitset_type m_bitset;
BitsetCount( bitset_type const& bitset)
BitsetCount( bitset_type const& bitset )
: m_bitset(bitset)
{}
size_type apply() const
{
size_type count = 0u;
parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
return count;
}
KOKKOS_INLINE_FUNCTION
static void init( value_type & count)
void init( value_type & count ) const
{
count = 0u;
}
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & count, const volatile size_type & incr )
void join( volatile value_type & count, const volatile size_type & incr ) const
{
count += incr;
}
KOKKOS_INLINE_FUNCTION
void operator()( size_type i, value_type & count) const
void operator()( size_type i, value_type & count ) const
{
count += popcount(m_bitset.m_blocks[i]);
count += bit_count( m_bitset.m_blocks[i] );
}
};
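// Usage sketch (bitset_t stands for an illustrative Kokkos::Bitset type):
//   BitsetCount< bitset_t > counter( bits );
//   const auto num_set = counter.apply(); // parallel_reduce over the blocks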
}} //Kokkos::Impl
} // namespace Impl
} // namespace Kokkos
#endif // KOKKOS_BITSET_IMPL_HPP

View File

@ -713,13 +713,20 @@ public:
typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
typedef typename dView0::host_mirror_space host ;
typedef typename dView0::host_mirror_space host_drv_space ;
typedef Kokkos::Experimental::View< T , device > View0 ;
typedef Kokkos::Experimental::View< T* , device > View1 ;
typedef Kokkos::Experimental::View< T******* , device > View7 ;
typedef typename View0::host_mirror_space host_view_space ;
TestDynViewAPI()
{
run_test_resize_realloc();
run_test_mirror();
run_test();
run_test_scalar();
run_test();
run_test_const();
run_test_subview();
run_test_subview_strided();
@ -735,19 +742,147 @@ public:
TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
}
static void run_test_resize_realloc()
{
dView0 drv0("drv0", 10, 20, 30);
ASSERT_EQ( drv0.rank(), 3);
Kokkos::Experimental::resize(drv0, 5, 10);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 5);
ASSERT_EQ( drv0.dimension_1(), 10);
ASSERT_EQ( drv0.dimension_2(), 1);
Kokkos::Experimental::realloc(drv0, 10, 20);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 10);
ASSERT_EQ( drv0.dimension_1(), 20);
ASSERT_EQ( drv0.dimension_2(), 1);
}
static void run_test_mirror()
{
typedef Kokkos::Experimental::DynRankView< int , host > view_type ;
typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
typedef typename view_type::HostMirror mirror_type ;
view_type a("a");
mirror_type am = Kokkos::Experimental::create_mirror_view(a);
mirror_type ax = Kokkos::Experimental::create_mirror(a);
ASSERT_EQ( & a() , & am() );
ASSERT_EQ( a.rank() , am.rank() );
ASSERT_EQ( ax.rank() , am.rank() );
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
}
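// Summary sketch of the mirror semantics exercised above, assuming a typical
// device build: create_mirror() always allocates a fresh host copy, while
// create_mirror_view() allocates only when the memory spaces differ and
// otherwise returns an alias of the original allocation.
Kokkos::DynRankView<double, Kokkos::HostSpace> h("h", 100);
auto m1 = Kokkos::create_mirror( h );      // distinct allocation: m1.data() != h.data()
auto m2 = Kokkos::create_mirror_view( h ); // same memory space: m2.data() == h.data()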
static void run_test_scalar()
{
typedef typename dView0::HostMirror hView0 ;
typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView
dView0 dx , dy ;
hView0 hx , hy ;
@ -765,6 +900,79 @@ public:
Kokkos::Experimental::deep_copy( hy , dy );
ASSERT_EQ( hx(), hy() );
ASSERT_EQ( dx.rank() , hx.rank() );
ASSERT_EQ( dy.rank() , hy.rank() );
//View - DynRankView Interoperability tests
// deep_copy DynRankView to View
View0 vx("vx");
Kokkos::deep_copy( vx , dx );
ASSERT_EQ( rank(dx) , rank(vx) );
View0 vy("vy");
Kokkos::deep_copy( vy , dy );
ASSERT_EQ( rank(dy) , rank(vy) );
// deep_copy View to DynRankView
dView0 dxx("dxx");
Kokkos::deep_copy( dxx , vx );
ASSERT_EQ( rank(dxx) , rank(vx) );
View7 vcast = dx.ConstDownCast();
ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
View7 vcast1( dy.ConstDownCast() );
ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
//View - DynRankView Interoperability tests
// copy View to DynRankView
dView0 dfromvx( vx );
auto hmx = Kokkos::create_mirror_view(dfromvx) ;
Kokkos::deep_copy(hmx , dfromvx);
auto hvx = Kokkos::create_mirror_view(vx) ;
Kokkos::deep_copy(hvx , vx);
ASSERT_EQ( rank(hvx) , rank(hmx) );
ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
// copy-assign View to DynRankView
dView0 dfromvy = vy ;
auto hmy = Kokkos::create_mirror_view(dfromvy) ;
Kokkos::deep_copy(hmy , dfromvy);
auto hvy = Kokkos::create_mirror_view(vy) ;
Kokkos::deep_copy(hvy , vy);
ASSERT_EQ( rank(hvy) , rank(hmy) );
ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
View7 vtest1("vtest1",2,2,2,2,2,2,2);
dView0 dfromv1( vtest1 );
ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
dView0 dfromv2( vcast );
ASSERT_EQ( dfromv2.rank() , vcast.Rank );
ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
dView0 dfromv3 = vcast1;
ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
}
static void run_test()
@ -782,22 +990,32 @@ public:
(void) thing;
}
dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
ASSERT_TRUE( d_uninitialized.data() != nullptr );
ASSERT_EQ( d_uninitialized.rank() , 2 );
ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
ASSERT_EQ( d_uninitialized.dimension_2() , 1 );
dView0 dx , dy , dz ;
hView0 hx , hy , hz ;
ASSERT_TRUE( dx.ptr_on_device() == 0 );
ASSERT_TRUE( dy.ptr_on_device() == 0 );
ASSERT_TRUE( dz.ptr_on_device() == 0 );
ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
ASSERT_TRUE( dy.ptr_on_device() == 0 ); //Okay with UVM
ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
ASSERT_TRUE( hx.ptr_on_device() == 0 );
ASSERT_TRUE( hy.ptr_on_device() == 0 );
ASSERT_TRUE( hz.ptr_on_device() == 0 );
ASSERT_EQ( dx.dimension_0() , 0u );
ASSERT_EQ( dy.dimension_0() , 0u );
ASSERT_EQ( dz.dimension_0() , 0u );
ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
ASSERT_EQ( hx.dimension_0() , 0u );
ASSERT_EQ( hy.dimension_0() , 0u );
ASSERT_EQ( hz.dimension_0() , 0u );
ASSERT_EQ( dx.rank() , 0u );
ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
ASSERT_EQ( hx.rank() , 0u );
dx = dView0( "dx" , N1 , N2 , N3 );
@ -806,11 +1024,11 @@ public:
hx = hView0( "hx" , N1 , N2 , N3 );
hy = hView0( "hy" , N1 , N2 , N3 );
ASSERT_EQ( dx.dimension_0() , unsigned(N1) );
ASSERT_EQ( dy.dimension_0() , unsigned(N1) );
ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
ASSERT_EQ( dx.rank() , 3 );
ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
ASSERT_EQ( hx.rank() , 3 );
dx = dView0( "dx" , N0 , N1 , N2 , N3 );
@ -823,19 +1041,23 @@ public:
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
ASSERT_EQ( dx.rank() , 4 );
ASSERT_EQ( dy.rank() , 4 );
ASSERT_EQ( hx.rank() , 4 );
ASSERT_EQ( hy.rank() , 4 );
ASSERT_EQ( dx.use_count() , size_t(1) );
dView0_unmanaged unmanaged_dx = dx;
ASSERT_EQ( dx.use_count() , size_t(1) );
dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
dx.dimension_0(),
dx.dimension_1(),
dx.dimension_2(),
dx.dimension_3());
{
// Destruction of this view should be harmless
const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
@ -888,6 +1110,19 @@ public:
hx = Kokkos::Experimental::create_mirror( dx );
hy = Kokkos::Experimental::create_mirror( dy );
ASSERT_EQ( hx.rank() , dx.rank() );
ASSERT_EQ( hy.rank() , dy.rank() );
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
// T v1 = hx() ; // Generates compile error as intended
// T v2 = hx(0,0) ; // Generates compile error as intended
// hx(0,0) = v2 ; // Generates compile error as intended
@ -990,7 +1225,9 @@ public:
for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
{ ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
}}}}
// ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
}
dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
@ -1006,6 +1243,35 @@ public:
ASSERT_TRUE( dx.ptr_on_device() == 0 );
ASSERT_TRUE( dy.ptr_on_device() == 0 );
ASSERT_TRUE( dz.ptr_on_device() == 0 );
//View - DynRankView Interoperability tests
// deep_copy from view to dynrankview
const int testdim = 4;
dView0 dxx("dxx",testdim);
View1 vxx("vxx",testdim);
auto hvxx = Kokkos::create_mirror_view(vxx);
for (int i = 0; i < testdim; ++i)
{ hvxx(i) = i; }
Kokkos::deep_copy(vxx,hvxx);
Kokkos::deep_copy(dxx,vxx);
auto hdxx = Kokkos::create_mirror_view(dxx);
Kokkos::deep_copy(hdxx,dxx);
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hdxx(i) ); }
ASSERT_EQ( rank(hdxx) , rank(hvxx) );
ASSERT_EQ( hdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
// deep_copy from dynrankview to view
View1 vdxx("vdxx",testdim);
auto hvdxx = Kokkos::create_mirror_view(vdxx);
Kokkos::deep_copy(hvdxx , hdxx);
ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
ASSERT_EQ( hvdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
}
typedef T DataType ;
@ -1059,35 +1325,66 @@ public:
// N0 = 1000,N1 = 3,N2 = 5,N3 = 7
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
ASSERT_EQ( d7.rank() , 7 );
sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview
sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 );
ASSERT_EQ( ds0.rank() , 0 );
//Basic test - ALL
sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs
sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() );
ASSERT_EQ( dsALL.rank() , 7 );
// Send a single value for one rank
// Send a value to final rank returning rank 6 subview
sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
ASSERT_EQ( dsm1.rank() , 6 );
// Send a std::pair as a rank
// Send a std::pair as argument to a rank
sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
ASSERT_EQ( dssp.rank() , 7 );
// Send a kokkos::pair as a rank; take default layout as input
// Send a kokkos::pair as argument to a rank; take default layout as input
dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
ASSERT_EQ( dd0.rank() , 7 );
sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
ASSERT_EQ( dtkp.rank() , 7 );
// Return rank 7 subview, taking a pair as one argument, layout stride input
sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
ASSERT_EQ( ds7.rank() , 7 );
// Default Layout DynRankView
dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
ASSERT_EQ( dv6.rank() , 6 );
// DynRankView with LayoutRight
typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
ASSERT_EQ( dr5.rank() , 5 );
// LayoutStride but arranged as LayoutRight
unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 };
sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) );
// NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that
// rank deduction can properly take place
unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
ls.dimension[5] = ~size_t(0);
ls.dimension[6] = ~size_t(0);
ls.dimension[7] = ~size_t(0);
sdView d5("d5", ls);
ASSERT_EQ( d5.rank() , 5 );
// LayoutStride arranged as LayoutRight - commented out as example that fails unit test
// unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
// sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
//
// Fails the following unit test:
// ASSERT_EQ( d5.rank() , dr5.rank() );
//
// Explanation: In the construction of the Kokkos::LayoutStride above, since the
// remaining dimensions are not specified, they default to values of 0
// rather than ~size_t(0).
// When passed to the DynRankView constructor, the default dimensions (of 0)
// are counted toward the dynamic rank, returning an incorrect value
// (i.e. rank 7 rather than 5).
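// Illustrative sketch (not the actual Kokkos implementation) of the rank
// deduction rule described above: the dynamic rank is the count of leading
// extents that are not the "unused" sentinel ~size_t(0).
inline unsigned deduced_rank_sketch( const size_t dims[8] ) {
unsigned r = 0;
while ( r < 7 && dims[r] != ~size_t(0) ) ++r; // stop at the first unused extent
return r;
}
// With dims = { N0,N1,N2,2,2,~size_t(0),~size_t(0),~size_t(0) } this yields 5;
// with the trailing extents left at 0 it yields 7, the failure mode above.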
// Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should)
ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
@ -1100,21 +1397,21 @@ public:
// Rank 5 subview of rank 5 dynamic rank view, layout stride input
sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
ASSERT_EQ( ds5.rank() , 5 );
// Pass in extra ALL arguments beyond the rank of the DynRankView.
// This behavior is allowed: the extra ALL arguments are ignored when
// src.rank() < the number of arguments, but be careful!
sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
ASSERT_EQ( ds5.rank() , ds5plus.rank() );
ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
ASSERT_EQ( ds5.rank() , ds5plus.rank() );
ASSERT_EQ( ds5.rank() , 5 );
#if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 );
ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 ); // passing an argument for a rank beyond the view's rank is allowed iff it is 0.
#endif
// Similar test to rank 5 above, but create rank 4 subview
@ -1131,9 +1428,9 @@ public:
static void run_test_subview_strided()
{
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;
drview_left xl2( "xl2", 100 , 200 );
drview_right xr2( "xr2", 100 , 200 );
@ -1159,35 +1456,37 @@ public:
drview_left xl4( "xl4", 10 , 20 , 30 , 40 );
drview_right xr4( "xr4", 10 , 20 , 30 , 40 );
drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
//Replace subdynrankview with subview - test
drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
ASSERT_EQ( yl4.rank() , 2);
ASSERT_EQ( yr4.rank() , 2);
ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
}
static void run_test_vector()
{
static const unsigned Length = 1000 , Count = 8 ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;
multivector_type mv = multivector_type( "mv" , Length , Count );
multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;
svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
@ -1251,7 +1550,6 @@ public:
const_smultivector_type cmv( mv );
typename smultivector_type::const_type cmvX( cmv );
typename const_smultivector_type::const_type ccmvX( cmv );
}
};

View File

@ -61,8 +61,7 @@ struct TestDynamicView
typedef typename Space::execution_space execution_space ;
typedef typename Space::memory_space memory_space ;
typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space >
memory_pool_type ;
typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
@ -129,11 +128,9 @@ struct TestDynamicView
typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;
const unsigned int chunk_size = 1024 ;
// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) );
memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );
// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);

View File

@ -34,6 +34,7 @@
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC
#cmakedefine KOKKOS_HAVE_DEBUG
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_HAVE_CXX11
#cmakedefine KOKKOS_HAVE_CUSPARSE

View File

@ -8,11 +8,22 @@ SET(SOURCES
PerfTestCuda.cpp
)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerfTest
# Per #374, we always want to build this test, but we only want to run
# it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
TRIBITS_ADD_EXECUTABLE(
PerfTestExec
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerfTest
NAME PerfTestExec
COMM serial mpi
NUM_MPI_PROCS 1
CATEGORIES PERFORMANCE
FAIL_REGULAR_EXPRESSION " FAILED "
)

View File

@ -159,7 +159,7 @@ struct TextureFetch
Kokkos::Cuda::fence();
Kokkos::Impl::Timer timer;
Kokkos::Timer timer;
for (int j=0; j<10; ++j) {
RandomReduce f(array,indexes);
f.apply(reduce);

View File

@ -153,7 +153,7 @@ struct ModifiedGramSchmidt
Kokkos::deep_copy( one , (Scalar) 1 );
Kokkos::Impl::Timer timer ;
Kokkos::Timer timer ;
for ( size_type j = 0 ; j < count ; ++j ) {
// Reduction : tmp = dot( Q(:,j) , Q(:,j) );

View File

@ -252,7 +252,7 @@ struct HexGrad
execution_space::fence();
for ( int i = 0 ; i < iter ; ++i ) {
Kokkos::Impl::Timer timer ;
Kokkos::Timer timer ;
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
execution_space::fence();
const double dt = timer.seconds();

View File

@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) {
Kokkos::Impl::Timer timer;
T res = LoopVariant<T>(loop,test);
double time1 = timer.seconds();
double time = timer.seconds();
timer.reset();
T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
double time2 = timer.seconds();
double timeNonAtomic = timer.seconds();
timer.reset();
T resSerial = LoopVariantSerial<T>(loop,test);
double time3 = timer.seconds();
double timeSerial = timer.seconds();
time1*=1e6/loop;
time2*=1e6/loop;
time3*=1e6/loop;
time *=1e6/loop;
timeNonAtomic*=1e6/loop;
timeSerial *=1e6/loop;
//textcolor_standard();
bool passed = true;
if(resSerial!=res) passed = false;
//if(!passed) textcolor(RESET,BLACK,YELLOW);
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
type_name,test,passed?"PASSED":"FAILED",loop,
1.0*resSerial,1.0*res,1.0*resNonAtomic,
timeSerial,time,timeNonAtomic,(int)sizeof(T));
//if(!passed) textcolor_standard();
printf("\n");
}
@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) {
int main(int argc, char* argv[])
{
int type = -1;
int loop = 1000000;
int loop = 100000;
int test = -1;
for(int i=0;i<argc;i++)

View File

@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif
namespace Kokkos {
namespace Impl {
struct CudaLockArraysStruct {
int* atomic;
int* scratch;
int* threadid;
};
}
}
__device__ __constant__
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
int* kokkos_impl_cuda_atomic_lock_array ;
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
}
}
namespace Kokkos {
namespace Impl {
__device__ inline
@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}
__device__ inline
@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}
}
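// Sketch of how the two device functions above are typically paired to
// emulate an atomic update on a type with no native CUDA atomic. The functor
// Op is hypothetical, and the real Kokkos implementation also issues memory
// fences; this only illustrates the spin-on-lock pattern.
template< class T , class Op >
__device__ void locked_update_sketch( T * const dest , Op op ) {
bool done = false ;
while ( ! done ) {
if ( Kokkos::Impl::lock_address_cuda_space( (void*) dest ) ) {
*dest = op( *dest ) ; // exclusive access while the per-address lock is held
Kokkos::Impl::unlock_address_cuda_space( (void*) dest ) ;
done = true ;
}
}
}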
@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > {
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
#ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
// Invoke the driver function on the device
@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > {
#endif
#ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );

View File

@ -51,10 +51,10 @@
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Kokkos_Core.hpp>
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
namespace Kokkos {
#if ! KOKKOS_USING_EXP_VIEW
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
enum { TEXTURE_BOUND_1D = 2u << 27 };
if ( tracker.attribute() == NULL ) {
// check for correct allocator
const bool ok_alloc = tracker.allocator()->support_texture_binding();
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
if (ok_alloc && ok_count) {
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
tracker.set_attribute( attr );
}
else {
std::ostringstream oss;
oss << "Error: Cannot attach texture object";
if (!ok_alloc) {
oss << ", incompatabile allocator " << tracker.allocator()->name();
}
if (!ok_count) {
oss << ", array " << tracker.label() << " too large";
}
oss << ".";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
std::ostringstream oss;
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
} // unnamed namespace
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
void CudaSpace::access_error()
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const )
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
@ -212,15 +133,6 @@ bool CudaUVMSpace::available()
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel() {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_atomic_lock_array[i] = 0;
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* lock_array_cuda_space_ptr(bool deallocate) {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
@ -845,13 +767,60 @@ int* lock_array_cuda_space_ptr(bool deallocate) {
return ptr;
}
void init_lock_array_cuda_space() {
int is_initialized = 0;
if(! is_initialized) {
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
return ptr;
}
}
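// Hypothetical usage illustrating the grow-only policy of
// cuda_resize_scratch_space() above: the buffer is reallocated only when the
// request exceeds the current size, unless force_shrink is passed.
void * p1 = Kokkos::Impl::cuda_resize_scratch_space( 1 << 20 );        // allocates 1 MiB
void * p2 = Kokkos::Impl::cuda_resize_scratch_space( 1 << 10 );        // no-op: p2 == p1
void * p3 = Kokkos::Impl::cuda_resize_scratch_space( 1 << 10 , true ); // shrinks to 1 KiB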

View File

@ -50,7 +50,6 @@
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos {
namespace Impl {

View File

@ -1,198 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
)
: m_tex_obj(0)
{
cuda_device_synchronize();
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
cuda_device_synchronize();
}
TextureAttribute::~TextureAttribute()
{
if (m_tex_obj) {
cudaDestroyTextureObject( m_tex_obj );
}
}
/*--------------------------------------------------------------------------*/
void * CudaMallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
return ptr;
}
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaUVMAllocator::allocate( size_t size )
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
return ptr;
#else
throw_runtime_exception( "CUDA VERSION does not support UVM" );
return NULL;
#endif
}
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaHostAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
return ptr;
}
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
} catch(...) {}
}
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */

View File

@ -1,190 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos { namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
#else
typedef const void * cuda_texture_object_type ;
#endif
struct TextureAttribute : public AllocatorAttributeBase
{
cuda_texture_object_type m_tex_obj ;
TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
);
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedUVMAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@ -51,8 +51,8 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
@ -70,7 +70,7 @@ __device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
int* kokkos_impl_cuda_atomic_lock_array ;
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
@ -190,7 +190,7 @@ namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 8 };
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices()
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
}
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
@ -226,14 +229,6 @@ private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
#if ! KOKKOS_USING_EXP_VIEW
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
#endif
public:
@ -255,6 +250,8 @@ public:
size_type * m_scratchUnified ;
cudaStream_t * m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
@ -293,6 +290,8 @@ public:
size_type * scratch_unified( const size_type size );
};
int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;
//----------------------------------------------------------------------------
@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton()
void CudaInternal::initialize( int cuda_device_id , int stream_count )
{
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = 1;
if ( is_initialized() ) return;
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_cuda_space();
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
}
@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
#endif
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
@ -582,26 +577,15 @@ CudaInternal::scratch_space( const Cuda::size_type size )
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
#endif
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
}
return m_scratchSpace ;
@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
Record::increment( r );
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchUnified ;
@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
void CudaInternal::finalize()
{
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
lock_array_cuda_space_ptr(true);
atomic_lock_array_cuda_space_ptr(false);
scratch_lock_array_cuda_space_ptr(false);
threadid_lock_array_cuda_space_ptr(false);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
@ -655,14 +632,6 @@ void CudaInternal::finalize()
::free( m_stream );
}
#if ! KOKKOS_USING_EXP_VIEW
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@ -670,8 +639,6 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
#endif
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
m_maxWarpCount = 0 ;
@ -730,7 +697,13 @@ int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
std::vector<unsigned>
Cuda::detect_device_arch()
@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch()
}
void Cuda::finalize()
{ Impl::CudaInternal::singleton().finalize(); }
{
Impl::CudaInternal::singleton().finalize();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )

View File

@ -57,17 +57,20 @@ template<class DriverType, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra);
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length);
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -123,13 +130,15 @@ template<class DriverType, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra);
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
blockSize*=2;
//calculate the occupancy with that optBlockSize and check whether it's larger than the largest one found so far
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> {
while(blockSize<1024) {
blockSize*=2;
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,

File diff suppressed because it is too large

View File

@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value,
cuda_inter_warp_reduction(value,join,max_active_thread);
}
template< class FunctorType , class JoinOp>
template< class FunctorType , class JoinOp , class ArgTag = void >
__device__
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value,
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
const JoinOp& join,
Cuda::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = 0;
value = neutral;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
@ -366,7 +367,12 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__CUDA_ARCH__ < 500)
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
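// Why 'value = neutral' above instead of the old 'value = 0': a hedged host
// illustration (plain C++, names made up). Seeding a min-reduction with
// literal zero corrupts the answer; seeding with the operation's neutral
// element does not.
#include <algorithm>
#include <cstdio>
#include <limits>

int main()
{
  const int partials[3] = { 7 , 5 , 9 };          // per-block partial minima
  int bad  = 0 ;                                  // old seed: literal zero
  int good = std::numeric_limits<int>::max();     // neutral element for min
  for ( int p : partials ) { bad = std::min( bad , p ); good = std::min( good , p ); }
  std::printf( "seed 0 -> %d (wrong), neutral -> %d (right)\n" , bad , good );
  return 0 ;
}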


@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
}
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
}
}
} while(1);
}
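// The union-plus-two-shuffles idiom above, isolated as a hedged sketch (same
// pre-CUDA-9 __shfl API as this file, 64-bit pointers assumed): a pointer
// cannot be shuffled in one call, so its two 32-bit halves are broadcast
// from lane 0 separately.
__device__ void broadcast_pointer_from_lane0( void * & p )
{
  union { void * ptr ; int raw[2] ; } u ;
  u.ptr    = p ;
  u.raw[0] = __shfl( u.raw[0] , 0 );  // low  32 bits from lane 0
  u.raw[1] = __shfl( u.raw[1] , 0 );  // high 32 bits from lane 0
  p = u.ptr ;
}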
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute before\n");
#endif
// Query the stack size, in bytes:
//
// size_t stack_size = 0 ;
// CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
//
// If not large enough then set the stack size, in bytes:
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute after\n");
#endif
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */


@ -0,0 +1,519 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
{
public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static
void execute( queue_type * const );
template< typename FunctorType >
static
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
{
using TaskType = TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
};
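// Why proc_set_apply launches a <<<1,1>>> kernel: host code cannot form the
// address of a __device__ function, so one device thread must store it.
// Hedged standalone sketch of the same idiom (all names are made up):
typedef void (*device_fn)(int);

__device__ void my_task_apply( int ) {}

__global__ void grab_device_fn( device_fn * slot ) { *slot = & my_task_apply ; }

// host side: grab_device_fn<<<1,1>>>( slot_in_device_memory );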
extern template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
* Cuda thread blocks for tasking are dimensioned:
* blockDim.x == vector length
* blockDim.y == team size
* blockDim.z == number of teams
* where
* blockDim.x * blockDim.y == WarpSize
*
* Both single thread and thread team tasks are run by a full Cuda warp.
* A single thread task is called by warp lane #0 and the remaining
* lanes of the warp are idle.
*/
template<>
class TaskExec< Kokkos::Cuda >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
public:
#if defined( __CUDA_ARCH__ )
__device__ void team_barrier() { /* __threadfence_block(); */ }
__device__ int team_rank() const { return threadIdx.y ; }
__device__ int team_size() const { return m_team_size ; }
#else
__host__ void team_barrier() {}
__host__ int team_rank() const { return 0 ; }
__host__ int team_size() const { return 0 ; }
#endif
};
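// A concrete instance of the dimensioning documented above (illustrative
// numbers, not taken from this commit): vector length 4 and team size 8
// fill one 32-thread warp, and four such warps share a block.
//
//   const dim3 block( 4 /* vector length */ , 8 /* team size */ , 4 /* teams */ );
//   // invariant: block.x * block.y == 32 == CudaTraits::WarpSize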
//----------------------------------------------------------------------------
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.y )
, end(arg_count)
, increment( blockDim.y )
, thread(arg_thread)
{}
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
)
: start( arg_start + threadIdx.y )
, end( arg_end)
, increment( blockDim.y )
, thread( arg_thread )
{}
#else
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
);
#endif
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.x )
, end(arg_count)
, increment( blockDim.x )
, thread(arg_thread)
{}
#else
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
#endif
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
}
}
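// Hedged host-side simulation of the strided pattern above for one warp
// (assumed numbers: warp of 32 lanes, stride/vector length 4, team size 8).
// After the deltas 16, 8, 4, lanes 0..3 each hold the sum over their vector
// lane across all eight team members.
#include <cstdio>

int main()
{
  const int stride = 4 , team_size = 8 , width = stride * team_size ; // 32
  int val[32] , next[32] ;
  for ( int i = 0 ; i < width ; ++i ) val[i] = i ;
  for ( int delta = width >> 1 ; delta >= stride ; delta >>= 1 ) {
    // shfl_down(v,delta,width): lane i reads lane i+delta, else its own value
    for ( int i = 0 ; i < width ; ++i )
      next[i] = val[i] + ( i + delta < width ? val[i+delta] : val[i] ) ;
    for ( int i = 0 ; i < width ; ++i ) val[i] = next[i] ;
  }
  for ( int x = 0 ; x < stride ; ++x )
    std::printf( "vector lane %d total: %d\n" , x , val[x] ); // 112 120 128 136
  return 0 ;
}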
// multiple within-warp non-strided reductions
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int vec_length)
{
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
}
}
// broadcast within warp
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_warp_broadcast
(ValueType& val,
int src_lane,
int width)
{
return Kokkos::shfl(val, src_lane, width);
}
// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction<ValueType, JoinType>(
join,
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
//TODO what is the point of creating this temporary?
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
//initialized_result = multi_shfl_warp_reduction(
multi_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda) {
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.y exclusive scan on 'val'
// accum = running total carried in from previous iterations;
// local_total = sum contributed by this iteration. First an INCLUSIVE scan:
for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
if(threadIdx.y*blockDim.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val,
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
Impl::CudaTraits::WarpSize);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
if ( threadIdx.y == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.x exclusive scan on 'val'
// accum = running total carried in from previous iterations;
// local_total = sum contributed by this iteration. First an INCLUSIVE scan:
for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, blockDim.x);
if(threadIdx.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, 1, blockDim.x);
if ( threadIdx.x == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
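// Hedged host sketch of the inclusive-scan-then-shift trick used by both
// parallel_scan overloads above, on one made-up vector of 8 contributions.
#include <cstdio>

int main()
{
  const int n = 8 ;
  int val[8] = { 3 , 1 , 4 , 1 , 5 , 9 , 2 , 6 };
  // inclusive scan with doubling offsets (mirrors the shfl_up loop);
  // sweeping downward makes each pass read only not-yet-updated entries
  for ( int offset = 1 ; offset < n ; offset <<= 1 )
    for ( int i = n - 1 ; i >= offset ; --i ) val[i] += val[i-offset] ;
  const int total = val[n-1] ;        // last slot holds the grand total
  // shift right by one to make the scan exclusive; slot 0 becomes zero
  for ( int i = n - 1 ; i > 0 ; --i ) val[i] = val[i-1] ;
  val[0] = 0 ;
  for ( int i = 0 ; i < n ; ++i ) std::printf( "%d " , val[i] ); // 0 3 4 8 9 14 23 25
  std::printf( "| total %d\n" , total );                         // 31
  return 0 ;
}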
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */


@ -46,9 +46,10 @@
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
// #define DETAILED_PRINT
@ -93,9 +94,8 @@ CudaTaskPolicyQueue
, const unsigned arg_team_size
)
: m_space( Kokkos::CudaUVMSpace()
, arg_task_max_size
, arg_task_max_size * arg_task_max_count
, 1 /* only one level of memory pool */
, arg_task_max_size * arg_task_max_count * 1.2
, 16 /* log2(superblock size) */
)
, m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 }
@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) {
member( kokkos_impl_cuda_shared_memory<void>()
, 16 /* shared_begin */
, team_task->m_shmem_size /* shared size */
, 0 /* scratch level 1 pointer */
, 0 /* scratch level 1 size */
, 0 /* league rank */
, 1 /* league size */
);
@ -926,5 +928,5 @@ void Task::clear_dependence()
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */


@ -47,19 +47,11 @@
#define KOKKOS_CUDA_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && \
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_CUDA_TASK_POLICY
/* The TaskPolicy< Cuda > capability requires nvcc using the option:
* --relocatable-device-code=true
*/
#include <Kokkos_Cuda.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
@ -81,8 +73,6 @@ public:
private:
friend struct CudaTaskPolicyQueue ;
CudaTaskPolicyQueue * m_policy ;
TaskMember * volatile * m_queue ;
function_team_type m_team ; ///< Apply function on CUDA
@ -819,9 +809,11 @@ public:
static member_type member_single()
{
return
member_type( 0 /* shared memory */
, 0 /* shared memory begin */
, 0 /* shared memory size */
member_type( 0 /* shared memory pointer */
, 0 /* shared memory begin offset */
, 0 /* shared memory end offset */
, 0 /* scratch level_1 pointer */
, 0 /* scratch level_1 size */
, 0 /* league rank */
, 1 /* league size */ );
}
@ -832,10 +824,10 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */


@ -56,8 +56,6 @@
#include <impl/Kokkos_Shape.hpp>
#include <Kokkos_View.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace >
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if ! KOKKOS_USING_EXP_VIEW
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4).
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back either to normal reads out of global memory,
// or to the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0).
template< typename ValueType
, class MemorySpace
, class AliasType =
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void
>::type
>::type
>::type
>::type
>
class CudaTextureFetch {
private:
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
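// The aliasing trick described above, isolated as a hedged device-side sketch
// (requires __CUDA_ARCH__ >= 350 for __ldg; the function name is made up):
// an 8-byte double is fetched through the 8-byte ::int2 alias so the
// read-only data path can service it.
__device__ double ldg_double_via_int2( const double * p )
{
  ::int2 v = __ldg( reinterpret_cast< const ::int2 * >( p ) );
  return *reinterpret_cast< double * >( & v );
}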
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, float4 > {
private:
typedef float4 AliasType;
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
union Float4ValueType {
float4 f4[2];
ValueType val;
};
Float4ValueType convert;
convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) );
convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 );
return convert.val;
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, void >
{
private:
const ValueType * m_ptr ;
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_ptr(0) {};
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
m_ptr = rhs.m_ptr;
return *this ;
}
explicit KOKKOS_INLINE_FUNCTION
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
m_ptr = base_view_ptr;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
m_ptr = base_view_ptr;
return *this;
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
return m_ptr[ i ];
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
&&
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
&&
ViewTraits::memory_traits::RandomAccess
>::type >
{
public:
enum { ReturnTypeIsReference = false };
typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
, typename ViewTraits::memory_space> handle_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
{
return handle_type(arg_data_ptr, arg_tracker);
}
typedef typename ViewTraits::value_type return_type;
};
}
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
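// Hedged sketch of the use case this ViewDataHandle specialization serves
// (view names are made up): a const, RandomAccess View in CudaSpace routes
// its element reads through the texture-fetch handle above.
//
//   typedef Kokkos::View< const double * , Kokkos::CudaSpace ,
//                         Kokkos::MemoryTraits< Kokkos::RandomAccess > > tex_view ;
//   // assigning from a View<double*,CudaSpace> "a" attaches the texture object:
//   // tex_view b = a ;  // b(i) inside a kernel then uses tex1Dfetch / __ldg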
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */


@ -0,0 +1,611 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <initializer_list>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
#define KOKKOS_MDRANGE_IVDEP
#endif
namespace Kokkos { namespace Experimental {
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
Flat, // Do not tile, only valid for inner direction
};
template <typename ExecSpace>
struct default_outer_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
template <typename ExecSpace>
struct default_inner_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
// Iteration Pattern
template < unsigned N
, Iterate OuterDir = Iterate::Default
, Iterate InnerDir = Iterate::Default
>
struct Rank
{
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
static constexpr int rank = N;
static constexpr Iterate outer_direction = OuterDir;
static constexpr Iterate inner_direction = InnerDir;
};
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
{
using range_policy = RangePolicy<Properties...>;
static_assert( !std::is_same<range_policy,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename range_policy::iteration_pattern;
using work_tag = typename range_policy::work_tag;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
? iteration_pattern::outer_direction
: default_outer_direction< typename range_policy::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename range_policy::execution_space>::value ) ;
// Ugly workaround for Intel 14 not handling scoped enums correctly
static constexpr int Flat = static_cast<int>( Iterate::Flat );
static constexpr int Right = static_cast<int>( Iterate::Right );
using size_type = typename range_policy::index_type;
using index_type = typename std::make_signed<size_type>::type;
template <typename I>
MDRangePolicy( std::initializer_list<I> upper_corner )
{
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
const auto u = upper_corner.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(0);
m_dim[i] = static_cast<index_type>(u[i]);
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB, typename T>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
, std::initializer_list<T> tile
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
static_assert( inner_direction != Flat, "Kokkos Error: tiling not supported with flat iteration" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
const auto t = tile.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
index_type m_offset[rank];
index_type m_dim[rank];
int m_tile[rank];
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
};
namespace Impl {
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
using work_tag = typename MDRange::work_tag;
using index_type = typename MDRange::index_type;
using size_type = typename MDRange::size_type;
MDRange m_range;
Functor m_func;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor const& f )
: m_range(range)
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor && f )
: m_range(range)
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor const& f )
: m_range( std::forward<MDRange>(range) )
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor && f )
: m_range( std::forward<MDRange>(range) )
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor && ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor && ) = default;
// Rank-2, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1 );
}}
}
}
// Rank-2, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( tag, i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1 );
}}
}
}
//---------------------------------------------------------------------------
// Rank-3, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( work_tag{}
, m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( work_tag{}
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1, i2 );
}}}
}
}
// Rank-3, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( tag, i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1, i2 );
}}}
}
}
};
} // namespace Impl
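// Hedged standalone check of the tile-index arithmetic above (rank-2,
// outer_direction == Right, zero offsets; the dims and tile sizes are made up):
#include <algorithm>
#include <cstdio>

int main()
{
  const int dim[2]  = { 10 , 7 };   // m_dim
  const int tile[2] = { 4 , 4 };    // m_tile
  const int tile_dim[2] = { ( dim[0] + tile[0] - 1 ) / tile[0]     // 3
                          , ( dim[1] + tile[1] - 1 ) / tile[1] };  // 2
  for ( int t = 0 ; t < tile_dim[0] * tile_dim[1] ; ++t ) {        // 6 tiles
    const int t0 = t / tile_dim[1] , t1 = t % tile_dim[1] ;
    const int b0 = t0 * tile[0] , e0 = std::min( b0 + tile[0] , dim[0] );
    const int b1 = t1 * tile[1] , e1 = std::min( b1 + tile[1] , dim[1] );
    std::printf( "tile %d -> i0 in [%d,%d), i1 in [%d,%d)\n" , t , b0 , e0 , b1 , e1 );
  }
  return 0 ;
}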
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
}} // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
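// Hedged usage sketch of the interface defined above (the functor, extents,
// and label are made up; the policy spelling assumes RangePolicy accepts the
// Rank iteration pattern as a property, as this commit's traits do):
#include <Kokkos_Core.hpp>

struct Scale {
  Kokkos::View<double**> a ;
  KOKKOS_INLINE_FUNCTION void operator()( const int i , const int j ) const
    { a(i,j) *= 2.0 ; }
};

void scale_example( Kokkos::View<double**> a , int N0 , int N1 )
{
  using namespace Kokkos::Experimental;
  MDRangePolicy< Rank< 2 , Iterate::Right , Iterate::Right > >
    policy( { 0 , 0 } , { N0 , N1 } );      // corners; default 4x4 tiles
  md_parallel_for( policy , Scale{ a } , "scale" );
}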

File diff suppressed because it is too large.


@ -121,13 +121,22 @@ public:
return *this;
}
//! Assignment operator.
/// \brief Assignment operator, for volatile <tt>*this</tt> and
/// nonvolatile input.
///
/// \param src [in] Input; right-hand side of the assignment.
///
/// This operator returns \c void instead of <tt>volatile
/// complex<RealType>& </tt>. See Kokkos Issue #177 for the
/// explanation. In practice, this means that you should not chain
/// assignments with volatile lvalues.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
void operator= (const complex<InputRealType>& src) volatile {
re_ = src.re_;
im_ = src.im_;
return *this;
// We deliberately do not return anything here. See explanation
// in public documentation above.
}
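// Practical consequence of the void return (hedged, illustrative snippet):
//
//   Kokkos::complex<double> a( 1.0 , 2.0 ) , c ;
//   volatile Kokkos::complex<double> b ;
//   b = a ;          // still fine: this operator= assigns normally
//   c = ( b = a ) ;  // no longer compiles: the volatile operator= returns void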
//! Assignment operator.


@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,86 +36,43 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
#define KOKKOS_BASIC_ALLOCATORS_HPP
#ifndef KOKKOS_CORE_CONCEPTS_HPP
#define KOKKOS_CORE_CONCEPTS_HPP
#if ! KOKKOS_USING_EXP_VIEW
#include <type_traits>
namespace Kokkos { namespace Impl {
namespace Kokkos {
//Schedules for Execution Policies
struct Static {};
struct Dynamic {};
/// class UnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
class UnmanagedAllocator
//Schedule Wrapper Type
template<class T>
struct Schedule
{
public:
static const char * name() { return "Unmanaged Allocator"; }
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static_assert( std::is_same<T,Static>::value
|| std::is_same<T,Dynamic>::value
, "Kokkos: Invalid Schedule<> type."
);
using schedule_type = Schedule<T>;
using type = T;
};
/// class MallocAllocator
class MallocAllocator
//Specify Iteration Index Type
template<typename T>
struct IndexType
{
public:
static const char * name()
{
return "Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
using index_type = IndexType<T>;
using type = T;
};
} // namespace Kokkos
/// class AlignedAllocator
/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
class AlignedAllocator
{
public:
static const char * name()
{
return "Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
/// class PageAlignedAllocator
/// memory aligned to PAGE_SIZE
class PageAlignedAllocator
{
public:
static const char * name()
{
return "Page Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_BASIC_ALLOCATORS_HPP
#endif // KOKKOS_CORE_CONCEPTS_HPP
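These wrappers are consumed as execution-policy properties; a brief sketch (property order is arbitrary, and the lambda form assumes KOKKOS_LAMBDA is available for the configured backend):

#include <cstdint>
#include <Kokkos_Core.hpp>

void scale(Kokkos::View<float*> x, int64_t n) {
  // Dynamic scheduling with 64-bit iteration indices.
  typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>,
                               Kokkos::IndexType<int64_t> > policy_t;
  Kokkos::parallel_for( policy_t(0, n), KOKKOS_LAMBDA(const int64_t i) { x(i) *= 2.0f; } );
}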

View File

@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
} // namespace Kokkos
#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
using Kokkos::Experimental::kokkos_malloc ;
@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ;
}
#else
namespace Kokkos {
namespace Impl {
// should only be used by kokkos_malloc and kokkos_free
struct MallocHelper
{
static void increment_ref_count( AllocationTracker const & tracker )
{
tracker.increment_ref_count();
}
static void decrement_ref_count( AllocationTracker const & tracker )
{
tracker.decrement_ref_count();
}
};
} // namespace Impl
/* Allocate memory from a memory space.
* The allocation is tracked in Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const std::string label, size_t count) {
if(count == 0) return NULL;
typedef typename Arg::memory_space MemorySpace;
Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);
Impl::MallocHelper::increment_ref_count( tracker );
return tracker.alloc_ptr();
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const size_t& count) {
return kokkos_malloc<Arg>("DefaultLabel",count);
}
/* Free memory from a memory space.
*/
template< class Arg = DefaultExecutionSpace>
void kokkos_free(const void* ptr) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
if (tracker.is_valid()) {
Impl::MallocHelper::decrement_ref_count( tracker );
}
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_realloc(const void* old_ptr, size_t size) {
if(old_ptr == NULL)
return kokkos_malloc<Arg>(size);
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
tracker.reallocate(size);
return tracker.alloc_ptr();
}
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
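With the tracker-based overloads gone, allocations go through the Experimental functions re-exported above; a hedged sketch of the surviving interface (the explicit HostSpace template argument and label are illustrative):

#include <cstddef>
#include <Kokkos_Core.hpp>

void demo(std::size_t n) {
  // The label appears in Kokkos' allocation tracking, easing leak diagnosis.
  double* p = static_cast<double*>(
    Kokkos::kokkos_malloc<Kokkos::HostSpace>("demo_buffer", n * sizeof(double)) );
  p = static_cast<double*>(
    Kokkos::kokkos_realloc<Kokkos::HostSpace>(p, 2 * n * sizeof(double)) );
  Kokkos::kokkos_free<Kokkos::HostSpace>(p);
}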

View File

@ -69,6 +69,9 @@ namespace {
/**\brief Token to indicate that a parameter's value is to be automatically selected */
constexpr AUTO_t AUTO = Kokkos::AUTO_t();
}
struct InvalidType {};
}
//----------------------------------------------------------------------------
@ -205,7 +208,7 @@ namespace Impl {
template< class Functor
, class Policy
, class EnableFunctor = void
, class EnablePolicy = void
, class EnablePolicy = void
>
struct FunctorPolicyExecutionSpace;
@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelReduce ;
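The new ReducerType parameter (defaulting to InvalidType) lets a reducer object ride along with the functor; a sketch, assuming the Max reducer shipped with this snapshot's Kokkos_Parallel_Reduce.hpp lives in Kokkos::Experimental and is constructed from a host scalar:

#include <Kokkos_Core.hpp>

double max_entry(Kokkos::View<double*> x, int n) {
  double result = 0.0;
  // ParallelReduce instantiates with ReducerType = Max<double> instead of InvalidType.
  Kokkos::Experimental::Max<double> reducer(result);
  Kokkos::parallel_reduce( Kokkos::RangePolicy<>(0, n),
    KOKKOS_LAMBDA(const int i, double& update) {
      if (x(i) > update) update = x(i);
    }, reducer );
  return result;
}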

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -56,11 +56,14 @@
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
@ -108,7 +111,7 @@ public:
//! This execution space's preferred array layout.
typedef LayoutLeft array_layout ;
//!
//!
typedef ScratchMemorySpace< Cuda > scratch_memory_space ;
//@}
@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <KokkosExp_View.hpp>
#include <Cuda/KokkosExp_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
//----------------------------------------------------------------------------

View File

@ -54,10 +54,7 @@
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
/*--------------------------------------------------------------------------*/
@ -77,33 +74,6 @@ public:
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
CudaSpace();
CudaSpace( CudaSpace && rhs ) = default ;
CudaSpace( const CudaSpace & rhs ) = default ;
@ -137,7 +107,7 @@ namespace Impl {
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_cuda_space();
void init_lock_arrays_cuda_space();
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
@ -146,7 +116,23 @@ void init_lock_array_cuda_space();
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer.
/// If the array has not yet been allocated, this call allocates it first.
int* lock_array_cuda_space_ptr(bool deallocate = false);
int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
///
/// Team and Thread private scratch allocations in
/// global memory are acquired via locks.
/// This function retrieves the lock array pointer.
/// If the array has not yet been allocated, this call allocates it first.
int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for unique identifiers.
///
/// Unique identifiers in the range 0-Cuda::concurrency
/// are provided via locks.
/// This function retrieves the lock array pointer.
/// If the array has not yet been allocated, this call allocates it first.
int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
}
} // namespace Kokkos
@ -172,33 +158,6 @@ public:
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
CudaUVMSpace();
CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
@ -242,22 +201,6 @@ public:
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaHostAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
CudaHostPinnedSpace();
CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,167 +47,15 @@
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_AnalyzePolicy.hpp>
#include <Kokkos_Concepts.hpp>
#include <iostream>
//----------------------------------------------------------------------------
namespace Kokkos {
//Schedules for Execution Policies
struct Static {
};
struct Dynamic {
};
//Schedule Wrapper Type
template<class ScheduleType>
struct Schedule {
static_assert(std::is_same<ScheduleType,Static>::value ||
std::is_same<ScheduleType,Dynamic>::value,
"Kokkos: Invalid Schedule<> type.");
typedef Schedule<ScheduleType> schedule_type;
typedef ScheduleType type;
};
//Specify Iteration Index Type
template<typename iType>
struct IndexType {
static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>.");
typedef IndexType<iType> index_type;
typedef iType type;
};
namespace Impl {
template<class Arg>
struct is_schedule_type {
enum { value = 0};
};
template<class ScheduleType>
struct is_schedule_type<Schedule<ScheduleType> > {
enum {value = 1 };
};
template<class Arg>
struct is_index_type {
enum { value = 0 };
};
template<typename iType>
struct is_index_type<IndexType<iType> > {
enum { value = 1 };
};
template<typename Arg>
struct is_tag_type {
enum { value = !(is_execution_space<Arg>::value ||
is_schedule_type<Arg>::value ||
is_index_type<Arg>::value ||
std::is_integral<Arg>::value)};
};
//Policy Traits
template<class ... Properties>
struct PolicyTraits;
template<>
struct PolicyTraits<void> {
typedef void execution_space;
typedef void schedule_type;
typedef void index_type;
typedef void tag_type;
};
//Strip off ExecutionSpace
template<class ExecutionSpace, class ... Props>
struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value,
"ExecutionPolicy: Only one execution space template argument may be used.");
typedef ExecutionSpace execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off ScheduleType
template<class ScheduleType, class ... Props>
struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value,
"ExecutionPolicy: Only one Schedule<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef ScheduleType schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off IndexType
template<typename iType, class ... Props>
struct PolicyTraits<void, IndexType<iType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off raw IndexType
template<typename iType, class ... Props>
struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off TagType
template<class TagType, class ... Props>
struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value &&
!is_execution_space<TagType>::value &&
!is_index_type<TagType>::value &&
!std::is_integral<TagType>::value
>::type,
TagType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value,
"ExecutionPolicy: Only one tag type template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef TagType tag_type;
};
template<class ... Props>
struct PolicyTraits {
#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value,
Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value,
Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value,
typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value,
void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag;
#else
typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space;
typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type;
typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag;
typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type;
typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type;
#endif
};
}
}
namespace Kokkos {
/** \brief Execution policy for work over a range of an integral type.
*
* Valid template argument options:
@ -230,7 +78,9 @@ namespace Kokkos {
* Blocking is the granularity of partitioning the range among threads.
*/
template<class ... Properties>
class RangePolicy: public Impl::PolicyTraits<Properties ... > {
class RangePolicy
: public Impl::PolicyTraits<Properties ... >
{
private:
typedef Impl::PolicyTraits<Properties ... > traits;
@ -243,6 +93,7 @@ private:
public:
//! Tag this class as an execution policy
typedef RangePolicy execution_policy;
typedef typename traits::index_type member_type ;
KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
@ -348,7 +199,7 @@ public:
: m_begin(0), m_end(0)
{
if ( part_size ) {
// Split evenly among partitions, then round up to the granularity.
const member_type work_part =
( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
@ -356,7 +207,7 @@ public:
m_begin = range.begin() + work_part * part_rank ;
m_end = m_begin + work_part ;
if ( range.end() < m_begin ) m_begin = range.end() ;
if ( range.end() < m_end ) m_end = range.end() ;
}
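Worked example: splitting a 100-element range into part_size = 8 partitions gives work_part = (100 + 7) / 8 = 13 (before rounding up to the chunk granularity), so rank 7 covers [91, 104) until its end is clamped to range.end() = 100.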
@ -366,10 +217,11 @@ public:
member_type m_end ;
WorkRange();
WorkRange & operator = ( const WorkRange & );
};
};
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -377,38 +229,6 @@ public:
namespace Kokkos {
namespace Experimental {
/** \brief Scratch memory request accepting per team and per thread value
*
* An instance of this class can be given as the last argument to a
* TeamPolicy constructor. It sets the amount of user requested shared
* memory for the team.
*/
template< class MemorySpace >
class TeamScratchRequest {
size_t m_per_team;
size_t m_per_thread;
public:
TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
m_per_team(per_team_), m_per_thread(per_thread_) {
}
size_t per_team() const {
return m_per_team;
}
size_t per_thread() const {
return m_per_thread;
}
size_t total(const size_t team_size) const {
return m_per_team + m_per_thread * team_size;
}
};
}
namespace Impl {
@ -451,11 +271,9 @@ public:
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
template<class MemorySpace>
TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
/* TeamPolicyInternal( int league_size_request , int team_size_request );
template<class MemorySpace>
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
/** \brief The actual league size (number of teams) of the policy.
*
@ -574,12 +392,14 @@ class TeamPolicy: public
typedef Impl::TeamPolicyInternal<
typename Impl::PolicyTraits<Properties ... >::execution_space,
Properties ...> internal_policy;
typedef Impl::PolicyTraits<Properties ... > traits;
public:
typedef TeamPolicy execution_policy;
TeamPolicy& operator = (const TeamPolicy&) = default;
/** \brief Construct policy with the given instance of the execution space */
TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
: internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {}
@ -594,13 +414,11 @@ public:
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
: internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
template<class MemorySpace>
TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
: internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {}
/* TeamPolicy( int league_size_request , int team_size_request )
: internal_policy(league_size_request,team_size_request) {}
template<class MemorySpace>
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
: internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {}
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & )
: internal_policy(league_size_request,Kokkos::AUTO()) {}*/
private:
TeamPolicy(const internal_policy& p):internal_policy(p) {}
@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
//----------------------------------------------------------------------------

View File

@ -120,21 +120,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::HBWMallocAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
/* Functions unique to the HBWSpace */
static int in_parallel();

View File

@ -55,9 +55,6 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/KokkosExp_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/
@ -128,25 +125,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
#else
typedef Impl::AlignedAllocator allocator ;
#endif
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();

View File

@ -133,11 +133,23 @@
// still identifies as 7.0
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
#endif
#if ( CUDA_VERSION < 8000 )
#define KOKKOS_LAMBDA [=]__device__
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#endif
#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
// Cuda version 8.0 still needs the functor wrapper
#if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
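A typical dispatch site for the macro (illustrative; with CUDA >= 8.0 the lambda is compiled for both host and device):

#include <Kokkos_Core.hpp>

void axpy(double alpha, Kokkos::View<double*> x, Kokkos::View<double*> y, int n) {
  // KOKKOS_LAMBDA expands to [=]__host__ __device__ under CUDA 8, [=]__device__ before.
  Kokkos::parallel_for( n, KOKKOS_LAMBDA(const int i) { y(i) += alpha * x(i); } );
}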
/*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */
@ -440,27 +452,16 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* Transitional macro to change between old and new View,
* default to use new View.
/* Transitional macros to change between old and new View
 * are no longer supported.
*/
#if ! defined( KOKKOS_USING_EXP_VIEW )
#if defined( KOKKOS_USING_DEPRECATED_VIEW )
#define KOKKOS_USING_EXP_VIEW 0
#else
#define KOKKOS_USING_EXP_VIEW 1
#endif
#error "Kokkos deprecated View has been removed"
#endif
#if KOKKOS_USING_EXP_VIEW
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#define KOKKOS_USING_EXP_VIEW 1
#define KOKKOS_USING_EXPERIMENTAL_VIEW
#endif
#else /* ! KOKKOS_USING_EXP_VIEW */
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible"
#endif
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

File diff suppressed because it is too large

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -58,9 +58,11 @@
#endif
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
/*--------------------------------------------------------------------------*/

View File

@ -1,12 +1,12 @@
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -35,7 +35,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
@ -125,17 +125,26 @@ struct pair
return *this;
}
/// \brief Assignment operator.
/// \brief Assignment operator, for volatile <tt>*this</tt>.
///
/// This calls the assignment operators of T1 and T2. It won't
/// \param p [in] Input; right-hand side of the assignment.
///
/// This calls the assignment operators of T1 and T2. It will not
/// compile if the assignment operators are not defined and public.
///
/// This operator returns \c void instead of <tt>volatile pair<T1,
/// T2>& </tt>. See Kokkos Issue #177 for the explanation. In
/// practice, this means that you should not chain assignments with
/// volatile lvalues.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile
void operator=(const volatile pair<U,V> &p) volatile
{
first = p.first;
second = p.second;
return *this;
// We deliberately do not return anything here. See explanation
// in public documentation above.
}
// from std::pair<U,V>

View File

@ -57,7 +57,6 @@
#include <typeinfo>
#endif
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy & policy
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
#endif
}
@ -210,8 +209,8 @@ void parallel_for( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -222,8 +221,8 @@ void parallel_for( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
#endif
}
@ -248,405 +247,9 @@ void parallel_for( const std::string & str
(void) str;
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... execution_space ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... execution_space ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
// typedef typename
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
// execution_space ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
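A minimal use of the integral-range overloads, reducing into a host scalar (illustrative):

#include <Kokkos_Core.hpp>

double dot(Kokkos::View<double*> x, Kokkos::View<double*> y, int n) {
  double result = 0.0;
  // The work_count overload builds RangePolicy<DefaultExecutionSpace>(0, n) internally.
  Kokkos::parallel_reduce( n, KOKKOS_LAMBDA(const int i, double& update) {
    update += x(i) * y(i);
  }, result );
  return result;
}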
// general policy and view output
template< class ExecPolicy , class FunctorType , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and pod or array of pod output
template< class ExecPolicy , class FunctorType >
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
#ifdef KOKKOS_HAVE_CUDA
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value &&
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
, const std::string& str = ""
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
)
#else
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value)
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
>::type result_ref
, const std::string& str = ""
)
#endif
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and view output
template< class FunctorType , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and pod or array of pod output
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits<
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
Impl::is_integral<FunctorType>::value,
void,FunctorType>::type
, void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< true
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
#ifndef KOKKOS_HAVE_CUDA
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
#endif
} // namespace Kokkos
#include <Kokkos_Parallel_Reduce.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif
@ -849,8 +452,8 @@ void parallel_scan( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
@ -861,8 +464,8 @@ void parallel_scan( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif

File diff suppressed because it is too large

View File

@ -66,11 +66,15 @@ public:
private:
mutable char * m_iter ;
char * m_end ;
mutable char * m_iter_L0 ;
char * m_end_L0 ;
mutable char * m_iter_L1 ;
char * m_end_L1 ;
mutable int m_multiplier;
mutable int m_offset;
mutable int m_default_level;
ScratchMemorySpace();
ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
@ -95,34 +99,58 @@ public:
template< typename IntType >
KOKKOS_INLINE_FUNCTION
void* get_shmem (const IntType& size) const {
void* tmp = m_iter + m_offset * align (size);
if (m_end < (m_iter += align (size) * m_multiplier)) {
m_iter -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end-m_iter));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
void* get_shmem (const IntType& size, int level = -1) const {
if(level == -1)
level = m_default_level;
if(level == 0) {
void* tmp = m_iter_L0 + m_offset * align (size);
if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L0-m_iter_L0));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
}
return tmp;
} else {
void* tmp = m_iter_L1 + m_offset * align (size);
if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L1-m_iter_L1));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
}
return tmp;
}
return tmp;
}
template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr , const IntType & size )
: m_iter( (char *) ptr )
, m_end( m_iter + size )
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
: m_iter_L0( (char *) ptr_L0 )
, m_end_L0( m_iter_L0 + size_L0 )
, m_iter_L1( (char *) ptr_L1 )
, m_end_L1( m_iter_L1 + size_L1 )
, m_multiplier( 1 )
, m_offset( 0 )
, m_default_level( 0 )
{}
KOKKOS_INLINE_FUNCTION
const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const {
const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
m_default_level = level;
m_multiplier = multiplier;
m_offset = offset;
return *this;
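Inside a team kernel the new level argument is reached through team_shmem(); a hedged sketch (the sizes and raw-pointer use are illustrative):

#include <Kokkos_Core.hpp>

struct TwoLevelScratch {
  typedef Kokkos::TeamPolicy<>::member_type member_type;
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& team) const {
    // Level 0: fast on-chip scratch; level 1: the global-memory scratch pool.
    double* fast = static_cast<double*>( team.team_shmem().get_shmem(64 * sizeof(double), 0) );
    double* big  = static_cast<double*>( team.team_shmem().get_shmem(4096 * sizeof(double), 1) );
    if (fast == 0 || big == 0) return; // get_shmem returns NULL when a level is exhausted
  }
};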

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -50,12 +50,17 @@
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
@ -142,7 +147,9 @@ public:
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
static int is_initialized() { return 1 ; }
@ -151,7 +158,11 @@ public:
static int concurrency() { return 1; }
//! Free any resources being consumed by the device.
static void finalize() {}
static void finalize() {
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<
{
private:
size_t m_team_scratch_size ;
size_t m_thread_scratch_size ;
size_t m_team_scratch_size[2] ;
size_t m_thread_scratch_size[2] ;
int m_league_size ;
int m_chunk_size;
@ -324,8 +335,10 @@ public:
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_scratch_size = p.m_team_scratch_size;
m_thread_scratch_size = p.m_thread_scratch_size;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
@ -348,15 +361,15 @@ public:
inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; }
inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
/** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space &
, int league_size_request
, int /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -365,8 +378,8 @@ public:
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -374,8 +387,8 @@ public:
TeamPolicyInternal( int league_size_request
, int /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -383,8 +396,8 @@ public:
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request )
, m_chunk_size ( 32 )
{}
@ -401,26 +414,23 @@ public:
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_thread_scratch_size = per_thread.value;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
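Callers feed these setters through chained policy construction; a sketch assuming the Kokkos::PerTeam / Kokkos::PerThread helpers that build PerTeamValue / PerThreadValue:

#include <Kokkos_Core.hpp>

struct ScratchUser {
  typedef Kokkos::TeamPolicy<>::member_type member_type;
  KOKKOS_INLINE_FUNCTION
  void operator()(const member_type& team) const {
    (void) team.team_shmem().get_shmem(512, 1); // draw from the level-1 pool
  }
};

void launch(int league_size) {
  Kokkos::TeamPolicy<> policy(league_size, 1);
  // 1 KiB per team at level 0; 512 B per team plus 256 B per thread at level 1.
  Kokkos::parallel_for(
    policy.set_scratch_size(0, Kokkos::PerTeam(1024))
          .set_scratch_size(1, Kokkos::PerTeam(512), Kokkos::PerThread(256)),
    ScratchUser() );
}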
@ -440,7 +450,7 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
class ParallelFor< FunctorType ,
Kokkos::RangePolicy< Traits ... > ,
Kokkos::Serial
>
@ -489,9 +499,10 @@ public:
/*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
@ -499,14 +510,19 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
@ -515,15 +531,15 @@ private:
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
@ -532,15 +548,15 @@ private:
exec( pointer_type ptr ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -549,25 +565,43 @@ public:
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , 0 );
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
}
template< class ViewType >
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Serial reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Serial reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result )
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
static_assert( std::is_same< typename ViewType::memory_space
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
@ -697,15 +731,16 @@ public:
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ }
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Serial
>
{
@ -714,30 +749,35 @@ private:
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
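// ReducerConditional forwards the reducer when one is supplied (ReducerType
// is not InvalidType) and otherwise falls back to the functor, so the value
// traits and init below are taken from whichever type defines the reduction.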
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const int m_league ;
const int m_shared ;
const ReducerType m_reducer ;
pointer_type m_result_ptr ;
const int m_shared ;
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
@ -747,14 +787,14 @@ private:
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -763,7 +803,7 @@ public:
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , m_shared );
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
}
@ -771,12 +811,16 @@ public:
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result
)
, const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@ -786,6 +830,21 @@ public:
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func
}
}
//----------------------------------------------------------------------------
#include <impl/Kokkos_Serial_Task.hpp>
#endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */

View File

@ -1,4 +1,3 @@
/*
//@HEADER
// ************************************************************************
@ -47,13 +46,655 @@
#ifndef KOKKOS_TASKPOLICY_HPP
#define KOKKOS_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
//----------------------------------------------------------------------------
#include <Kokkos_Core_fwd.hpp>
// If compiling with CUDA, then CUDA 8 or better is required and relocatable
// device code must be enabled for the task policy to be available.
// nvcc relocatable device code option: --relocatable-device-code=true
#if ( defined( KOKKOS_COMPILER_NVCC ) )
#if ( 8000 <= CUDA_VERSION ) && \
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_TASKPOLICY
#endif
#else
#define KOKKOS_ENABLE_TASKPOLICY
#endif
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_TaskQueue.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
enum TaskPriority { TaskHighPriority = 0
, TaskRegularPriority = 1
, TaskLowPriority = 2 };
template< typename Space >
class TaskPolicy ;
template< typename Space >
void wait( TaskPolicy< Space > const & );
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/**\brief Implementation data for task data management, access, and execution.
*
* CRTP Inheritance structure to allow static_cast from the
* task root type and a task's FunctorType.
*
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
* TaskBase< Space , ResultType , void >
* : TaskBase< Space , void , void >
* { ... };
*/
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
}} // namespace Kokkos::Impl
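// Editorial aside, not part of this diff: a minimal self-contained sketch of
// the CRTP recovery described above, where a type-erased task root regains
// its stored functor via static_cast. TaskRoot, Task, and HelloFunctor are
// hypothetical stand-ins for TaskBase<Space,void,void> and
// TaskBase<Space,Result,Functor>.
//
//   #include <cstdio>
//
//   struct TaskRoot { void (*m_apply)( TaskRoot * ); };
//
//   template< typename Functor >
//   struct Task : public TaskRoot , public Functor {
//     explicit Task( Functor const & f )
//       : TaskRoot{ & Task::apply } , Functor( f ) {}
//     static void apply( TaskRoot * root )
//       {
//         // Downcast from the erased root, then invoke the functor sub-object:
//         Task * const task = static_cast< Task * >( root );
//         static_cast< Functor & >( *task )();
//       }
//   };
//
//   struct HelloFunctor { void operator()() const { std::printf("task\n"); } };
//
//   int main() {
//     Task< HelloFunctor > t{ HelloFunctor() };
//     t.m_apply( & t );  // a queue would call tasks through this stored pointer
//   }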
//----------------------------------------------------------------------------
namespace Kokkos {
/**
*
* Future< space > // value_type == void
* Future< value > // space == Default
* Future< value , space >
*
*/
template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
class Future {
private:
template< typename > friend class TaskPolicy ;
template< typename , typename > friend class Future ;
template< typename , typename , typename > friend class Impl::TaskBase ;
enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value };
enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value };
enum { Arg1_is_value = ! Arg1_is_space &&
! std::is_same< Arg1 , void >::value };
enum { Arg2_is_value = ! Arg2_is_space &&
! std::is_same< Arg2 , void >::value };
static_assert( ! ( Arg1_is_space && Arg2_is_space )
, "Future cannot be given two spaces" );
static_assert( ! ( Arg1_is_value && Arg2_is_value )
, "Future cannot be given two value types" );
using ValueType =
typename std::conditional< Arg1_is_value , Arg1 ,
typename std::conditional< Arg2_is_value , Arg2 , void
>::type >::type ;
using Space =
typename std::conditional< Arg1_is_space , Arg1 ,
typename std::conditional< Arg2_is_space , Arg2 , void
>::type >::type ;
using task_base = Impl::TaskBase< Space , ValueType , void > ;
using queue_type = Impl::TaskQueue< Space > ;
task_base * m_task ;
KOKKOS_INLINE_FUNCTION explicit
Future( task_base * task ) : m_task(0)
{ if ( task ) queue_type::assign( & m_task , task ); }
//----------------------------------------
public:
using execution_space = typename Space::execution_space ;
using value_type = ValueType ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
bool is_null() const { return 0 == m_task ; }
KOKKOS_INLINE_FUNCTION
int reference_count() const
{ return 0 != m_task ? m_task->reference_count() : 0 ; }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr Future() noexcept : m_task(0) {}
KOKKOS_INLINE_FUNCTION
Future( Future && rhs )
: m_task( rhs.m_task ) { rhs.m_task = 0 ; }
KOKKOS_INLINE_FUNCTION
Future( const Future & rhs )
: m_task(0)
{ if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future && rhs )
{
if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future & rhs )
{
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
//----------------------------------------
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( Future<A1,A2> && rhs )
: m_task( rhs.m_task )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
rhs.m_task = 0 ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( const Future<A1,A2> & rhs )
: m_task(0)
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future<A1,A2> & rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future<A1,A2> && rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
typename task_base::get_return_type
get() const
{
if ( 0 == m_task ) {
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
}
return m_task->get();
}
};
} // namespace Kokkos
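// Editorial usage note for the argument resolution above (assumes a build
// where the Serial device is enabled; int is an illustrative value type):
//
//   Kokkos::Future< Kokkos::Serial >         // value_type == void, Serial space
//   Kokkos::Future< int >                    // int value, default space
//   Kokkos::Future< int , Kokkos::Serial >   // int value, Serial space
//
//   static_assert( std::is_same< Kokkos::Future< int , Kokkos::Serial >
//                                  ::value_type , int >::value
//                , "value type deduced from the value argument" );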
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ExecSpace >
class TaskPolicy
{
private:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
track_type m_track ;
queue_type * m_queue ;
//----------------------------------------
// Process optional arguments to spawn and respawn functions
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const ) {}
// TaskTeam or TaskSingle
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskType const & arg
, Options const & ... opts )
{
task->m_task_type = arg ;
assign( task , opts ... );
}
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskPriority const & arg
, Options const & ... opts )
{
task->m_priority = arg ;
assign( task , opts ... );
}
// Future for a dependence
template< typename A1 , typename A2 , typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, Future< A1 , A2 > const & arg
, Options const & ... opts )
{
// Assign dependence to task->m_next
// which will be processed within a subsequent call to schedule.
// Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
}
if ( 0 != arg.m_task ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
}
assign( task , opts ... );
}
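// Note: the overload set above peels one option per call. For example,
// assign( task , TaskHighPriority , future ) dispatches on the first
// argument's type, records it on the task, then recurses on the remaining
// pack until the nullary overload terminates the unpacking.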
//----------------------------------------
public:
using execution_policy = TaskPolicy ;
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
KOKKOS_INLINE_FUNCTION
TaskPolicy() : m_track(), m_queue(0) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy const & rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
TaskPolicy( memory_space const & arg_memory_space
, unsigned const arg_memory_pool_capacity
, unsigned const arg_memory_pool_log2_superblock = 12 )
: m_track()
, m_queue(0)
{
typedef Kokkos::Experimental::Impl::SharedAllocationRecord
< memory_space , typename queue_type::Destroy >
record_type ;
record_type * record =
record_type::allocate( arg_memory_space
, "TaskQueue"
, sizeof(queue_type)
);
m_queue = new( record->data() )
queue_type( arg_memory_space
, arg_memory_pool_capacity
, arg_memory_pool_log2_superblock );
record->m_destroy.m_queue = m_queue ;
m_track.assign_allocated_record_to_uninitialized( record );
}
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using task_type = Impl::TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
return m_queue->allocate_block_size( sizeof(task_type) );
}
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
}
//----------------------------------------
/**\brief A task spawns a task with options
*
* 1) High, Regular, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
KOKKOS_FUNCTION
Future< typename FunctorType::value_type , ExecSpace >
task_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
//----------------------------------------
// Give single-thread back-ends an opportunity to clear
// the queue of ready tasks before allocating a new task
m_queue->iff_single_thread_recursive_execute();
//----------------------------------------
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
if ( f.m_task ) {
// Placement new construction
new ( f.m_task ) task_type( arg_functor );
// Reference count starts at two
// +1 for matching decrement when task is complete
// +1 for future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Spawning from within the execution space so the
// apply function pointer is guaranteed to be valid
f.m_task->m_apply = task_type::apply ;
m_queue->schedule( f.m_task );
// this task may be updated or executed at any moment
}
return f ;
}
/**\brief The host process spawns a task with options
*
* 1) High, Regular, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
inline
Future< typename FunctorType::value_type , ExecSpace >
host_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
if ( f.m_task ) {
// Placement new construction
new( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Potentially spawning outside the execution space, so the
// apply function pointer must be obtained from the execution space.
// Required for the Cuda execution space's function pointer.
queue_type::specialization::template
proc_set_apply< FunctorType >( & f.m_task->m_apply );
m_queue->schedule( f.m_task );
}
return f ;
}
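// Editorial usage sketch for host_spawn; the functor, pool size, and memory
// space are illustrative assumptions, not part of this diff:
//
//   struct HelloTask {
//     using value_type = void ;
//     template< typename Member >
//     KOKKOS_INLINE_FUNCTION void operator()( Member & ) const { /* body */ }
//   };
//
//   TaskPolicy< Kokkos::Serial > policy( Kokkos::HostSpace() , 1 << 20 );
//   auto f = policy.host_spawn( HelloTask()
//                             , Kokkos::TaskSingle
//                             , Kokkos::TaskHighPriority );
//   Kokkos::wait( policy );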
/**\brief Return a future that is complete
* when all input futures are complete.
*/
template< typename A1 , typename A2 >
KOKKOS_FUNCTION
Future< ExecSpace >
when_all( int narg , Future< A1 , A2 > const * const arg ) const
{
static_assert
( std::is_same< execution_space
, typename Future< A1 , A2 >::execution_space
>::value
, "Future must have same execution space" );
using future_type = Future< ExecSpace > ;
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
future_type f ;
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
if ( f.m_task ) {
new( f.m_task ) task_base();
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
task_base ** const dep = f.m_task->aggregate_dependences();
// Assign dependences to increment their reference count
// The futures may be destroyed upon returning from this call
// so increment reference count to track this assignment.
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = dep[i] = arg[i].m_task ;
if ( 0 != t ) {
Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
}
}
m_queue->schedule( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}
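// Editorial usage sketch for when_all (fa and fb are futures spawned
// earlier; all names are illustrative):
//
//   Kokkos::Future< Kokkos::Serial > deps[2] = { fa , fb };
//   auto all = policy.when_all( 2 , deps );  // completes after both inputs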
/**\brief An executing task respawns itself with options
*
* 1) High, Regular, or Low priority
* 2) With or without dependence
*/
template< class FunctorType , typename ... Options >
KOKKOS_FUNCTION
void respawn( FunctorType * task_self
, Options const & ... arg_options ) const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_base * const zero = (task_base *) 0 ;
task_base * const lock = (task_base *) task_base::LockTag ;
task_type * const task = static_cast< task_type * >( task_self );
// Precondition:
// task is in Executing state
// therefore m_next == LockTag
//
// Change to m_next == 0 for no dependence
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
}
assign( task , arg_options... );
// Postcondition:
// task is in Executing-Respawn state
// therefore m_next == dependence or 0
}
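// Editorial usage sketch for respawn, from inside a task body; m_phase,
// m_dep, and m_policy are assumed members of the illustrative functor:
//
//   template< typename Member >
//   KOKKOS_INLINE_FUNCTION void operator()( Member & ) {
//     if ( 0 == m_phase ) {                  // first run: defer behind m_dep
//       m_phase = 1 ;
//       m_policy.respawn( this , m_dep );    // re-enqueued once m_dep completes
//       return ;
//     }
//     /* second run: m_dep has completed */
//   }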
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskPolicy< S > const & );
//----------------------------------------
inline
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const noexcept
{ return m_queue->m_count_alloc ; }
KOKKOS_INLINE_FUNCTION
int allocated_task_count_max() const noexcept
{ return m_queue->m_max_alloc ; }
KOKKOS_INLINE_FUNCTION
long allocated_task_count_accum() const noexcept
{ return m_queue->m_accum_alloc ; }
};
template< typename ExecSpace >
inline
void wait( TaskPolicy< ExecSpace > const & policy )
{ policy.m_queue->execute(); }
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & );
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_TASKPOLICY_HPP */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Threads/Kokkos_ThreadsTeam.hpp>
#include <Threads/Kokkos_Threads_Parallel.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

File diff suppressed because it is too large

View File

@ -178,9 +178,10 @@ public:
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
@ -192,15 +193,21 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// TODO: static_assert that WorkTag is void when ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
@ -252,7 +259,7 @@ public:
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
#pragma omp parallel
{
@ -260,7 +267,7 @@ public:
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
@ -269,13 +276,13 @@ public:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
@ -289,7 +296,7 @@ public:
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
#pragma omp parallel
{
@ -302,7 +309,7 @@ public:
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() );
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
@ -319,13 +326,13 @@ public:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
@ -337,18 +344,35 @@ public:
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ViewType & arg_result_view )
, const ViewType & arg_result_view
, typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
static_assert( std::is_same< typename ViewType::memory_space
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
@ -568,13 +592,13 @@ public:
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
#pragma omp parallel
{
ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
( m_functor
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
}
/* END #pragma omp parallel */
}
@ -584,14 +608,15 @@ public:
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
};
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::OpenMP
>
{
@ -602,15 +627,19 @@ private:
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shmem_size ;
@ -644,7 +673,7 @@ public:
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
#pragma omp parallel
{
@ -652,8 +681,8 @@ public:
ParallelReduce::template exec_team< WorkTag >
( m_functor
, Member( exec , m_policy , m_shmem_size )
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
, Member( exec , m_policy , m_shmem_size, 0 )
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
@ -665,13 +694,13 @@ public:
max_active_threads = m_policy.league_size()* m_policy.team_size();
for ( int i = 1 ; i < max_active_threads ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
@ -682,12 +711,33 @@ public:
inline
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result )
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl

View File

@ -0,0 +1,329 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
TaskExec< Kokkos::OpenMP >::
TaskExec()
: m_self_exec( 0 )
, m_team_exec( 0 )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( 0 )
, m_team_rank( 0 )
, m_team_size( 1 )
{
}
TaskExec< Kokkos::OpenMP >::
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans threads
//   m_self_exec->pool_rev( team_size * group_rank )
//   through
//   m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
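// Worked example of the encoding above: for m_team_size == 2,
// m_sync_value == 0x0101 and m_sync_mask == 0x0303, i.e. one byte lane per
// member. Each barrier writes the low two bits of its member's lane, and
// XOR-ing the expected value with the mask every other step alternates the
// pattern between 0x0101 and 0x0202 so consecutive barriers cannot be
// confused with one another.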
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
#endif
//----------------------------------------------------------------------------
void TaskQueueSpecialization< Kokkos::OpenMP >::execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
using Member = TaskExec< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8 because the team barrier packs one sync byte
// per member into a single int64_t word
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Member single_exec ;
Member team_exec( self , team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire OpenMP thread pool to ensure initialization
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
}
// Team lead broadcasts the acquired task to team members:
if ( 1 < team_exec.team_size() ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
// All team members wait for whole team to reach this statement.
// This is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task, but necessary to prevent
// task_shared from being updated before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
void TaskQueueSpecialization< Kokkos::OpenMP >::
iff_single_thread_recursive_execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
task_root_type * task = end ;
do {
task = end ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
if ( end == task ) break ;
(*task->m_apply)( task , & single_exec );
queue->complete( task );
} while(1);
}
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,356 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::OpenMP >
{
public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::OpenMP
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
};
extern template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::OpenMP >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl:: TaskExec< Kokkos::OpenMP > & thread
, const iType & start
, const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >(thread,start,end);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
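// Editorial usage sketch from inside a task body; member is the TaskExec
// reference handed to the task, and x and n are illustrative:
//
//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( int i ) { x[i] = 2 * x[i] ; } );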
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
shared[0] += shared[i];
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
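// Editorial usage sketch (team-wide sum; member, x, and n are illustrative):
//
//   double sum = 0 ;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( int i , double & upd ) { upd += x[i] ; } , sum );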
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
join(shared[0], shared[i]);
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 performs the scan on the accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
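// Editorial usage sketch (exclusive prefix sum). ValueType is not deducible
// from a generic lambda, so it is supplied explicitly; member, x, y, and n
// are illustrative:
//
//   Kokkos::parallel_scan< int >( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( int i , int & partial , bool final )
//       { if ( final ) y[i] = partial ; partial += x[i] ; } );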
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */

View File

@ -49,6 +49,7 @@
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#ifdef KOKKOS_HAVE_OPENMP
@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
#if ! KOKKOS_USING_EXP_VIEW
OpenMPexec::Pool OpenMPexec::m_pool;
#else
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
#endif
void OpenMPexec::verify_is_process( const char * const label )
{
if ( omp_in_parallel() ) {
@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch()
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_pool[ rank_rev ] ) {
Record * const r = Record::get_record( m_pool[ rank_rev ] );
m_pool[ rank_rev ] = 0 ;
Record::decrement( r );
}
#else
m_pool.at(rank_rev).clear();
#endif
}
/* END #pragma omp parallel */
}
@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
#else
#pragma omp critical
{
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
}
#endif
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
}
/* END #pragma omp parallel */
@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count ,
}
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
//----------------------------------------------------------------------------
@ -350,6 +332,10 @@ void OpenMP::finalize()
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
hwloc::unbind_this_thread();
}
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
//----------------------------------------------------------------------------

View File

@ -46,7 +46,6 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
@ -63,38 +62,10 @@ public:
enum { MAX_THREAD_COUNT = 4096 };
#if ! KOKKOS_USING_EXP_VIEW
struct Pool
{
Pool() : m_trackers() {}
AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
OpenMPexec * operator[](int i)
{
return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
}
AllocationTracker & at(int i)
{
return m_trackers[i];
}
};
private:
static Pool m_pool; // Indexed by: m_pool_rank_rev
#else
private:
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
#endif
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
@ -145,6 +116,12 @@ public:
inline long team_work_index() const { return m_team_work_index ; }
inline int scratch_reduce_size() const
{ return m_scratch_reduce_end - m_scratch_exec_end ; }
inline int scratch_thread_size() const
{ return m_scratch_thread_end - m_scratch_reduce_end ; }
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
@ -157,15 +134,15 @@ public:
~OpenMPexec() {}
OpenMPexec( const int poolRank
, const int scratch_exec_size
, const int scratch_reduce_size
, const int scratch_thread_size )
: m_pool_rank( poolRank )
, m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
, m_scratch_exec_end( scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
OpenMPexec( const int arg_poolRank
, const int arg_scratch_exec_size
, const int arg_scratch_reduce_size
, const int arg_scratch_thread_size )
: m_pool_rank( arg_poolRank )
, m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
, m_scratch_exec_end( arg_scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
, m_barrier_state(0)
{}
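
The renamed arg_ parameters make the scratch layout easier to read: the three sizes are stored as cumulative end offsets into a single allocation, so scratch_reduce() and scratch_thread() above are plain pointer bumps from the object base. A worked example of the offset arithmetic, with illustrative sizes:

    // exec = 512, reduce = 1024, thread = 2048 bytes
    const int scratch_exec_end   = 512;                        // = 512
    const int scratch_reduce_end = scratch_exec_end + 1024;    // = 1536
    const int scratch_thread_end = scratch_reduce_end + 2048;  // = 3584
    // scratch_reduce() == (char *) this + 512
    // scratch_thread() == (char *) this + 1536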
@ -330,7 +307,7 @@ public:
Impl::OpenMPexec & m_exec ;
scratch_memory_space m_team_shared ;
int m_team_shmem ;
int m_team_scratch_size[2] ;
int m_team_base_rev ;
int m_team_rank_rev ;
int m_team_rank ;
@ -378,15 +355,15 @@ public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_shmem() const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@ -568,11 +545,12 @@ public:
inline
OpenMPexecTeamMember( Impl::OpenMPexec & exec
, const TeamPolicyInternal< OpenMP, Properties ...> & team
, const int shmem_size
, const int shmem_size_L1
, const int shmem_size_L2
)
: m_exec( exec )
, m_team_shared(0,0)
, m_team_shmem( shmem_size )
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
, m_team_base_rev(0)
, m_team_rank_rev(0)
, m_team_rank(0)
@ -580,7 +558,7 @@ public:
, m_league_rank(0)
, m_league_end(0)
, m_league_size( team.league_size() )
, m_chunk_size( team.chunk_size() )
, m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
, m_league_chunk_end(0)
, m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
, m_team_alloc( team.team_alloc())
@ -589,10 +567,9 @@ public:
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc();
const int chunk_size = team.chunk_size()>0?team.chunk_size():team.team_iter();
const int chunks_per_team = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
int league_iter_begin = league_iter_end - chunks_per_team * chunk_size;
const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size;
if (league_iter_begin < 0) league_iter_begin = 0;
if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
@ -611,7 +588,9 @@ public:
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
m_league_end = league_iter_end ;
m_league_rank = league_iter_begin ;
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0 );
}
if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
@ -627,10 +606,13 @@ public:
void next_static()
{
if ( ++m_league_rank < m_league_end ) {
if ( m_league_rank < m_league_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
bool valid_dynamic() {
@ -661,10 +643,13 @@ public:
if(m_invalid_thread)
return;
team_barrier();
if ( ++m_league_rank < m_league_chunk_end ) {
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
if ( m_league_rank < m_league_chunk_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
@ -687,8 +672,10 @@ public:
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size = p.m_team_scratch_size;
m_thread_scratch_size = p.m_thread_scratch_size;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
@ -719,8 +706,8 @@ private:
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size;
size_t m_thread_scratch_size;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
@ -753,15 +740,19 @@ public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
if(team_size_ < 0)
team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
@ -769,24 +760,24 @@ public:
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
TeamPolicyInternal( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
@ -803,24 +794,21 @@ public:
}
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_thread_scratch_size = per_thread.value;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
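
Previously the level argument was accepted and silently discarded; the policy now keeps independent level-0 and level-1 sizes, so per-level requests land in the right slot. A usage sketch of the three-argument overload above, with illustrative byte counts:

    Kokkos::TeamPolicy<> policy( league_size, team_size );
    // Request 512 bytes per team and 64 bytes per thread at scratch level 0:
    auto p = policy.set_scratch_size( 0, Kokkos::PerTeam( 512 ), Kokkos::PerThread( 64 ) );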

View File

@ -104,7 +104,7 @@ namespace Kokkos {
int Qthread::is_initialized()
{
Impl::s_number_workers != 0 ;
return Impl::s_number_workers != 0 ;
}
int Qthread::concurrency()

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -113,7 +113,7 @@ public:
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -136,7 +136,7 @@ public:
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -145,11 +145,13 @@ public:
//----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
template< class FunctorType , class ReducerType , class ArgTag >
inline
void exec_all_reduce( const FunctorType & func ) const
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
@ -160,14 +162,14 @@ public:
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -197,7 +199,7 @@ public:
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_worker_base[0] is the
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
@ -216,7 +218,7 @@ public:
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
@ -349,7 +351,7 @@ public:
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_shepherd_base[0] is the
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
@ -371,7 +373,7 @@ public:
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -130,9 +130,10 @@ public:
//----------------------------------------------------------------------------
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Qthread
>
{
@ -141,18 +142,24 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// Should static_assert that WorkTag is void when ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
inline static
@ -187,9 +194,10 @@ private:
ParallelReduce::template exec_range< WorkTag >(
self.m_functor, range.begin(), range.end(),
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor );
exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
}
public:
@ -197,26 +205,39 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result_view )
, const ViewType & arg_result_view
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{ }
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ }
};
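
The added constructor is the entry point for the new reducer interface: when the last argument of parallel_reduce is a reducer object rather than a result view, m_reducer carries it and m_result_ptr is taken from its result view. A sketch under the reducer spelling of this era (Kokkos::Experimental::Max); the view a and the bound n are illustrative:

    double max_val = 0 ;
    Kokkos::Experimental::Max<double> max_reducer( max_val );
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthread >( 0, n ),
      KOKKOS_LAMBDA( const int i, double & lmax ) {
        if ( a( i ) > lmax ) lmax = a( i ) ;  // thread-local maximum
      }, max_reducer );
    // max_val now holds the global maximum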
@ -291,10 +312,12 @@ public:
//----------------------------------------------------------------------------
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType
, TeamPolicy< Properties... >
, Kokkos::Qthread >
, ReducerType
, Kokkos::Qthread
>
{
private:
@ -303,14 +326,18 @@ private:
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
@ -345,9 +372,10 @@ private:
ParallelReduce::template exec_team< WorkTag >
( self.m_functor
, Member( exec , self.m_policy )
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
, ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
}
public:
@ -356,29 +384,43 @@ public:
void execute() const
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( m_functor )
( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result )
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
{ }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ }
};
//----------------------------------------------------------------------------
@ -395,8 +437,8 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;

View File

@ -58,6 +58,8 @@
#include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
@ -120,13 +122,13 @@ Task::~TaskMember()
}
Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( arg_verify )
@ -144,12 +146,12 @@ Task::TaskMember( const function_verify_type arg_verify
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( & Task::verify_type<void> )
@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg )
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
);
// It is a single thread's responsibility to close out
// this task's execution.
bool close_out = false ;
if ( task->m_apply_team && ! task->m_apply_single ) {
const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
// Initialize team size and rank with shephered info
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
@ -344,7 +342,7 @@ fflush(stdout);
if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier();
}
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) {
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
// Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
} // namespace Experimental
} // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */

View File

@ -69,6 +69,8 @@
#include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
namespace Kokkos {
@ -80,24 +82,24 @@ class TaskMember< Kokkos::Qthread , void , void >
{
public:
typedef void (* function_apply_single_type) ( TaskMember * );
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
private:
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_apply_single_type m_apply_single ; ///< Apply function
const function_apply_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_single_type m_apply_single ; ///< Apply function
const function_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
TaskMember() /* = delete */ ;
TaskMember( const TaskMember & ) /* = delete */ ;
@ -128,22 +130,22 @@ protected :
~TaskMember();
// Used by TaskMember< Qthread , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Qthread , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
public:
@ -221,7 +223,7 @@ public:
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 );
const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
@ -379,16 +381,16 @@ protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: task_root_type( & task_root_type::template verify_type< ResultType >
, arg_dealloc
@ -413,17 +415,17 @@ public:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, arg_apply_single
@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread >
public:
typedef Kokkos::Qthread execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
private:
@ -489,14 +492,17 @@ public:
, const unsigned arg_task_team_size = 0 /* choose default */
);
TaskPolicy() = default ;
TaskPolicy( TaskPolicy && rhs ) = default ;
TaskPolicy( const TaskPolicy & rhs ) = default ;
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy() = default ;
KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const { return m_active_count ; }
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f
@ -653,5 +659,6 @@ public:
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */

View File

@ -3,26 +3,23 @@
# Cloning repository and branch:
git clone https://github.com/stelleg/qthreads qthreads-with-clone
git clone git@github.com:Qthreads/qthreads.git qthreads
cd qthreads-with-clone
cd qthreads
# Added to ./git/config
#
# [branch "cloned_tasks"]
# remote = origin
# merge = refs/heads/cloned_tasks
#
# checkout branch with "cloned tasks"
git branch cloned_tasks
git checkout cloned_tasks
git pull
git checkout dev-kokkos
# Configure/autogen
sh autogen.sh
# configurure with 'hwloc' installation:
# configure with 'hwloc' installation:
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
# install
make install

View File

@ -53,6 +53,7 @@
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
//----------------------------------------------------------------------------
@ -134,11 +135,7 @@ void ThreadsExec::driver(void)
ThreadsExec::ThreadsExec()
: m_pool_base(0)
#if ! KOKKOS_USING_EXP_VIEW
, m_scratch()
#else
, m_scratch(0)
#endif
, m_scratch_reduce_end(0)
, m_scratch_thread_end(0)
, m_numa_rank(0)
@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec()
{
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_scratch ) {
@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec()
Record::decrement( r );
}
#else
m_scratch.clear();
#endif
m_pool_base = 0 ;
m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ;
@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch()
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( exec.m_scratch ) {
@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
Record::decrement( r );
}
#else
exec.m_scratch.clear();
#endif
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) {
#if KOKKOS_USING_EXP_VIEW
// Allocate tracked memory:
{
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
#else
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
#endif
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread
@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
}
#if KOKKOS_USING_EXP_VIEW
return s_threads_process.m_scratch ;
#else
return s_threads_process.m_scratch.alloc_ptr() ;
#endif
}
//----------------------------------------------------------------------------
@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
//----------------------------------------------------------------------------
@ -807,6 +776,10 @@ void ThreadsExec::finalize()
s_threads_process.m_pool_size = 1 ;
s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
//----------------------------------------------------------------------------

View File

@ -49,7 +49,6 @@
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
@ -89,11 +88,7 @@ private:
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker m_scratch ;
#else
void * m_scratch ;
#endif
int m_scratch_reduce_end ;
int m_scratch_thread_end ;
int m_numa_rank ;
@ -138,19 +133,10 @@ public:
static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank );
#if ! KOKKOS_USING_EXP_VIEW
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
#else
inline void * reduce_memory() const { return m_scratch ; }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
#endif
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }

View File

@ -129,15 +129,15 @@ public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(1,0) ; }
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@ -433,10 +433,11 @@ public:
void next_static()
{
if ( ++m_league_rank < m_league_end ) {
if ( m_league_rank < m_league_end ) {
team_barrier();
set_team_shared();
}
m_league_rank++;
}
bool valid_dynamic() {
@ -468,10 +469,11 @@ public:
if(m_invalid_thread)
return;
team_barrier();
if ( ++m_league_rank < m_league_chunk_end ) {
if ( m_league_rank < m_league_chunk_end ) {
team_barrier();
set_team_shared();
}
m_league_rank++;
}
void set_league_shmem( const int arg_league_rank
@ -504,8 +506,8 @@ private:
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size;
size_t m_thread_scratch_size;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
@ -549,8 +551,10 @@ public:
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size = p.m_team_scratch_size;
m_thread_scratch_size = p.m_thread_scratch_size;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
@ -577,7 +581,12 @@ public:
inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
if(team_size_ < 0)
team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
inline int team_iter() const { return m_team_iter ; }
/** \brief Specify league size, request team size */
@ -588,8 +597,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
@ -601,8 +610,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -612,8 +621,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,team_size_request); }
@ -623,8 +632,8 @@ public:
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_team_scratch_size ( 0 )
, m_thread_scratch_size ( 0 )
, m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -639,26 +648,23 @@ public:
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value;
p.m_thread_scratch_size = per_thread.value;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};

View File

@ -264,7 +264,7 @@ public:
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ }
};
@ -272,9 +272,10 @@ public:
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and RangePolicy */
template< class FunctorType , class ... Traits >
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
@ -286,14 +287,18 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
template< class TagType >
@ -344,9 +349,9 @@ private:
ParallelReduce::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end()
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
template<class Schedule>
@ -362,7 +367,7 @@ private:
exec.barrier();
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() );
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
@ -372,7 +377,7 @@ private:
work_index = exec.get_work_index();
}
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
public:
@ -380,7 +385,7 @@ public:
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
ThreadsExec::start( & ParallelReduce::exec , this );
@ -391,7 +396,7 @@ public:
const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
@ -399,9 +404,14 @@ public:
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const HostViewType & arg_result_view )
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
@ -410,14 +420,30 @@ public:
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
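
The static_asserts in the view-result constructor encode the backend's contract: the destination must be a View, and it must live in HostSpace because the fan-in writes the final value from host threads. A conforming call, with an illustrative rank-0 result view and view a:

    Kokkos::View< double, Kokkos::HostSpace > sum_view( "sum" );
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Threads >( 0, n ),
      KOKKOS_LAMBDA( const int i, double & lsum ) { lsum += a( i ) ; },
      sum_view );
    const double sum = sum_view();  // safe: this parallel_reduce is synchronous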
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */
template< class FunctorType , class ... Properties >
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Threads
>
{
@ -426,14 +452,19 @@ private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shared ;
@ -464,9 +495,9 @@ private:
ParallelReduce::template exec_team< WorkTag >
( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
public:
@ -474,7 +505,7 @@ public:
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::exec , this );
@ -484,20 +515,41 @@ public:
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor );
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result )
inline
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ }
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
//----------------------------------------------------------------------------

View File

@ -46,9 +46,10 @@
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) ))
#define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue
, const unsigned arg_task_team_size
)
: m_space( Kokkos::Threads::memory_space()
, arg_task_max_size
, arg_task_max_size * arg_task_max_count
, 1 /* only one level of memory pool */
, arg_task_max_size * arg_task_max_count * 1.2
, 16 /* log2(superblock size) */
)
, m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 }
@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task
// The user created the task memory pool with a size estimate;
// if the estimate is too low then report and throw an exception.
if ( m_space.get_min_chunk_size() < size_alloc ) {
if ( m_space.get_min_block_size() < size_alloc ) {
fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
, int(size_alloc)
, int(m_space.get_min_chunk_size())
, int(m_space.get_min_block_size())
);
fflush(stderr);
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
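
The queue now sizes its pool with a flat 20% slack factor over the worst-case task footprint and a fixed 2^16-byte superblock, and the failure check compares against the pool's minimum block size rather than the old chunk size. Illustrative arithmetic for the constructor arguments above:

    // task_max_size = 256 bytes, task_max_count = 1000 tasks:
    size_t pool_bytes = size_t( 256 * 1000 * 1.2 );  // 307200 bytes reserved
    size_t superblock = size_t(1) << 16 ;            // 65536-byte superblocks (log2 = 16)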
@ -926,5 +926,5 @@ void Task::clear_dependence()
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -50,7 +50,7 @@
#include <Kokkos_Threads.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
@ -737,10 +737,9 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */

View File

@ -246,8 +246,8 @@ private:
enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
// The allocation record resides in Host memory space
Record * m_record ;
uintptr_t m_record_bits ;
Record * m_record ;
public:

View File

@ -47,8 +47,6 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
/* For backward compatibility */
@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing {
} /* namespace Kokkos */
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -2604,18 +2604,24 @@ class ViewMapping< DstTraits , SrcTraits ,
&&
std::is_same< typename DstTraits::specialize , void >::value
&&
(
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
)
&&
std::is_same< typename SrcTraits::specialize , void >::value
&&
(
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
||
(
(
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
)
&&
(
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
)
)
)
)>::type >
{

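The widened condition adds a same-layout branch in front of the canonical-layout test, so two views whose layouts are identical are always assignable even when that layout is not LayoutLeft/LayoutRight/LayoutStride, while the old Left/Right/Stride cross-assignment path is unchanged. A sketch of what each branch admits, with illustrative views:

    Kokkos::View< double**, Kokkos::LayoutLeft >   a( "a", 10, 10 );
    Kokkos::View< double**, Kokkos::LayoutLeft >   b = a ;  // same-layout branch
    Kokkos::View< double**, Kokkos::LayoutStride > c = a ;  // canonical cross-layout branch (Left -> Stride)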
View File

@ -1,848 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Singleton.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <utility>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
/* Enable clean up of memory leaks */
#define CLEAN_UP_MEMORY_LEAKS 0
namespace Kokkos { namespace Impl {
namespace {
//-----------------------------------------------------------------------------
// AllocationRecord
//-----------------------------------------------------------------------------
//
// Used to track details about an allocation and provide a ref count
// sizeof(AllocationRecord) == 128
struct AllocationRecord
{
enum {
OFFSET = sizeof(AllocatorBase*) // allocator
+ sizeof(void*) // alloc_ptr
+ sizeof(uint64_t) // alloc_size
+ sizeof(AllocatorAttributeBase*) // attribute
+ sizeof(uint32_t) // node_index
+ sizeof(uint32_t) // ref_count
, LABEL_LENGTH = 128 - OFFSET
};
AllocatorBase * const allocator;
void * const alloc_ptr;
const uint64_t alloc_size;
AllocatorAttributeBase * const attribute;
const int32_t node_index;
volatile uint32_t ref_count;
const char label[LABEL_LENGTH];
AllocationRecord( AllocatorBase * const arg_allocator
, void * arg_alloc_ptr
, uint64_t arg_alloc_size
, int32_t arg_node_index
, const std::string & arg_label
)
: allocator(arg_allocator)
, alloc_ptr(arg_alloc_ptr)
, alloc_size(arg_alloc_size)
, attribute(NULL)
, node_index(arg_node_index)
, ref_count(1)
, label() // zero fill
{
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
}
~AllocationRecord()
{
if (attribute) {
delete attribute;
}
}
uint32_t increment_ref_count()
{
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
return old_value + 1u;
}
uint32_t decrement_ref_count()
{
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
return old_value - 1u;
}
void print( std::ostream & oss ) const
{
oss << "{ " << allocator->name()
<< " } : \"" << label
<< "\" ref_count(" << ref_count
<< ") memory[ " << alloc_ptr
<< " + " << alloc_size
<< " ]" ;
}
bool set_attribute( AllocatorAttributeBase * attr )
{
bool result = false;
if (attribute == NULL) {
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
, attr );
}
return result;
}
// disallow copy and assignment
AllocationRecord( const AllocationRecord & );
AllocationRecord & operator=(const AllocationRecord &);
};
template <int NumBlocks>
struct Bitset
{
enum { blocks = NumBlocks };
enum { size = blocks * 64 };
enum { block_mask = 63u };
enum { block_shift = 6 };
// used to find free bits in a bitset
static int count_trailing_zeros(uint64_t x)
{
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
return x ? __builtin_ctzll(x) : 64;
#elif defined( KOKKOS_COMPILER_INTEL )
enum { shift = 32 };
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
64 ;
#elif defined( KOKKOS_COMPILER_IBM )
return x ? __cnttz8(x) : 64;
#else
int i = 0;
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
return i;
#endif
}
Bitset()
: m_bits()
{
for (int i=0; i < blocks; ++i) {
m_bits[i] = 0u;
}
}
bool set( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
}
bool reset( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
}
bool test( int i )
{
const uint64_t block = m_bits[ i >> block_shift ];
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return block & bit;
}
int find_first_unset() const
{
for (int i=0; i < blocks; ++i) {
const uint64_t block = m_bits[i];
int b = count_trailing_zeros( ~block );
if ( b < 64 ) {
return (i << block_shift) + b;
}
}
return size;
}
volatile uint64_t m_bits[blocks];
};
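A usage sketch of the Bitset above (illustrative only, not part of the commit; assumes the definitions in this unnamed namespace). find_first_unset() applies count_trailing_zeros to ~block, so it returns the index of the first zero bit, or Bitset::size when every bit is set:
void bitset_demo()
{
  Bitset<15> bits;                             // 15 * 64 = 960 bits, all initially unset
  int slot = bits.find_first_unset();          // candidate slot
  if ( slot != bits.size && bits.set(slot) )   // set() returns true only for the winning thread
  {
    // slot is now exclusively owned; release it when finished
    bits.reset(slot);
  }
}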
//-----------------------------------------------------------------------------
// AllocationRecordPool -- singleton class
//
// global_alloc_rec_pool is the ONLY instance of this class
//
//-----------------------------------------------------------------------------
// Record AllocationRecords in a lock-free circular list.
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
// The head of the list is atomically updated to the last node found with
// unused space.
//
// Time to create an allocation record: amortized O(1), worst case O(num nodes)
// Time to destroy an allocation record: O(1)
//
// Singleton allocations are pushed onto a lock-free stack that is destroyed
// after the circular list of allocation records.
struct AllocationRecordPool
{
enum { BITSET_BLOCKS = 15 };
typedef Bitset<BITSET_BLOCKS> bitset_type;
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
struct AllocationNode
{
AllocationNode()
: next()
, bitset()
, buffer()
{
// set the first bit to used
bitset.set(0);
}
void * get_buffer( int32_t node_index )
{
return buffer + (node_index-1) * sizeof(AllocationRecord);
}
// return 0 if no space is available in the node
int32_t get_node_index()
{
int32_t node_index = 0;
do {
node_index = bitset.find_first_unset();
// successfully claimed a bit
if ( node_index != bitset.size && bitset.set(node_index) )
{
return node_index;
}
} while ( node_index != bitset.size );
return 0;
}
void clear_node_index( int32_t node_index )
{
bitset.reset(node_index);
}
AllocationNode * next;
bitset_type bitset;
char buffer[BUFFER_SIZE];
};
struct SingletonNode
{
void * buffer;
SingletonNode * next;
Impl::singleton_destroy_function_type destroy;
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
: buffer(NULL)
, next(NULL)
, destroy(destroy_func)
{
if (size) {
buffer = malloc(size);
create_func(buffer);
}
}
~SingletonNode()
{
if (buffer) {
try {
destroy(buffer);
} catch(...) {}
free(buffer);
}
}
};
AllocationRecordPool()
: head( new AllocationNode() )
, singleton_head(NULL)
{
// setup ring
head->next = head;
}
~AllocationRecordPool()
{
// delete allocation records
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
// print node bitset
for (int i=0; i < bitset_type::blocks; ++i ) {
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
}
std::cout << std::endl;
#endif
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
#if CLEAN_UP_MEMORY_LEAKS
/* Cleaning up memory leaks prevents memory error detection tools
* from reporting the original source of allocation, which can
* impede debugging with such tools.
*/
try {
destroy(alloc_rec);
}
catch(...) {}
#endif
}
}
curr->next = NULL;
delete curr;
curr = next;
} while ( curr != start );
//if ( !string_vec.empty() ) {
// std::sort( string_vec.begin(), string_vec.end() );
//
// std::ostringstream oss;
// oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
// for (size_t i=0; i< string_vec.size(); ++i)
// {
// oss << " " << string_vec[i] << std::endl;
// }
//
// std::cerr << oss.str() << std::endl;
//}
}
// delete singletons
{
SingletonNode * curr = singleton_head;
while (curr) {
SingletonNode * next = curr->next;
delete curr;
curr = next;
}
}
}
AllocationRecord * create( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
AllocationNode * start = volatile_load(&head);
AllocationNode * curr = start;
int32_t node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
while (node_index == 0 && curr != start)
{
node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
}
// Need to allocate and insert a new node
if (node_index == 0 && curr == start)
{
AllocationNode * new_node = new AllocationNode();
node_index = new_node->get_node_index();
AllocationNode * next = NULL;
do {
next = volatile_load(&curr->next);
new_node->next = next;
memory_fence();
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
curr = new_node;
}
void * buffer = curr->get_buffer(node_index);
// try to set head to curr
if ( start != curr )
{
atomic_compare_exchange( & head, start, curr );
}
return new (buffer) AllocationRecord( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, node_index
, arg_label
);
}
void destroy( AllocationRecord * alloc_rec )
{
if (alloc_rec) {
const int32_t node_index = alloc_rec->node_index;
AllocationNode * node = get_node( alloc_rec );
// deallocate memory
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
// call destructor
alloc_rec->~AllocationRecord();
// wait for writes to complete
memory_fence();
// clear node index
node->clear_node_index( node_index );
}
}
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
{
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
SingletonNode * next;
// insert new node at the head of the list
do {
next = volatile_load(&singleton_head);
node->next = next;
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
return node->buffer;
}
void print_memory( std::ostream & out ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
}
}
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Tracked Memory:" << std::endl;
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
out << oss.str() << std::endl;
}
else {
out << "No Tracked Memory" << std::endl;
}
}
// find an AllocationRecord such that
// alloc_ptr <= ptr < alloc_ptr + alloc_size
// otherwise return NULL
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
if ( (allocator == alloc_rec->allocator)
&& (alloc_ptr <= char_ptr)
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
{
return alloc_rec;
}
}
}
curr = next;
} while ( curr != start );
return NULL;
}
private:
AllocationNode * get_node( AllocationRecord * alloc_rec )
{
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
}
AllocationNode * head;
SingletonNode * singleton_head;
};
// create the global pool for allocation records
AllocationRecordPool global_alloc_rec_pool;
// convert a uintptr_t to an AllocationRecord pointer
inline
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
{
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
}
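The least-significant bit of a tracker's m_alloc_rec doubles as the reference-counting flag (REF_COUNT_BIT in the tracker header), which is exactly what to_alloc_rec masks off. A self-contained sketch of that tagging scheme (illustrative only; dummy stands in for a suitably aligned AllocationRecord):
#include <cassert>
#include <stdint.h>
int main()
{
  static int dummy = 0;  // stands in for an AllocationRecord (at least 2-byte aligned)
  uintptr_t tagged = reinterpret_cast<uintptr_t>(&dummy) | 0x1u;  // flag bit set
  void * restored = reinterpret_cast<void *>( tagged & ~static_cast<uintptr_t>(1) );
  assert( restored == &dummy );  // the same masking to_alloc_rec performs
  return 0;
}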
} // unnamed namespace
//-----------------------------------------------------------------------------
// Allocation Tracker methods
//-----------------------------------------------------------------------------
// Create a reference counted AllocationTracker
void AllocationTracker::initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
// create record
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, arg_label
);
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
}
void AllocationTracker::reallocate( size_t size ) const
{
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
if ( NULL != the_alloc_ptr )
{
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
*const_cast<uint64_t *>(&rec->alloc_size) = size;
}
else {
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
}
}
void AllocationTracker::increment_ref_count() const
{
to_alloc_rec( m_alloc_rec )->increment_ref_count();
}
void AllocationTracker::decrement_ref_count() const
{
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
if (the_ref_count == 0u) {
try {
global_alloc_rec_pool.destroy( alloc_rec );
}
catch(...) {}
}
}
namespace {
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
}
AllocatorBase * AllocationTracker::allocator() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->allocator;
}
return Allocator<NullAllocator>::singleton();
}
void * AllocationTracker::alloc_ptr() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
}
return NULL;
}
size_t AllocationTracker::alloc_size() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_size;
}
return 0u;
}
size_t AllocationTracker::ref_count() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->ref_count;
}
return 0u;
}
char const * AllocationTracker::label() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->label;
}
return "[Empty Allocation Tracker]";
}
void AllocationTracker::print( std::ostream & oss) const
{
if (m_alloc_rec & REF_COUNT_MASK) {
to_alloc_rec(m_alloc_rec)->print(oss);
}
else {
oss << label();
}
}
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
{
bool result = false;
if (m_alloc_rec & REF_COUNT_MASK) {
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
}
return result;
}
AllocatorAttributeBase * AllocationTracker::attribute() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->attribute;
}
return NULL;
}
void AllocationTracker::print_tracked_memory( std::ostream & out )
{
global_alloc_rec_pool.print_memory( out );
}
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
{
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
AllocationTracker tracker;
if ( alloc_rec != NULL )
{
if ( tracking_enabled() ) {
alloc_rec->increment_ref_count();
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
else {
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
}
}
return tracker ;
}
//-----------------------------------------------------------------------------
// static AllocationTracker
//-----------------------------------------------------------------------------
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
namespace {
// TODO : Detect compiler support for thread local variables
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_PTHREAD )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_SERIAL )
bool g_thread_local_tracking_enabled = true;
#endif
} // unnamed namespace
void AllocationTracker::disable_tracking()
{
g_thread_local_tracking_enabled = false;
}
void AllocationTracker::enable_tracking()
{
g_thread_local_tracking_enabled = true;
}
bool AllocationTracker::tracking_enabled()
{
return g_thread_local_tracking_enabled;
}
#else
namespace {
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
}
void AllocationTracker::disable_tracking()
{
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already disabled");
}
}
void AllocationTracker::enable_tracking()
{
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already enabled");
}
}
bool AllocationTracker::tracking_enabled()
{
return g_tracking_enabled == TRACKING_ENABLED;
}
#endif
//-----------------------------------------------------------------------------
// create singleton free function
//-----------------------------------------------------------------------------
void * create_singleton( size_t size
, Impl::singleton_create_function_type create_func
, Impl::singleton_destroy_function_type destroy_func )
{
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
}
}} // namespace Kokkos::Impl
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
#endif /* #if ! KOKKOS_USING_EXP_VIEW */

View File

@ -1,574 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
#define KOKKOS_ALLOCATION_TRACKER_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <iosfwd>
namespace Kokkos { namespace Impl {
//-----------------------------------------------------------------------------
// Create Singleton objects
//-----------------------------------------------------------------------------
typedef void * (*singleton_create_function_type)(void * buffer);
typedef void (*singleton_destroy_function_type)(void *);
void * create_singleton( size_t size
, singleton_create_function_type create_func
, singleton_destroy_function_type destroy_func
);
/// class Singleton
///
/// Default construct a singleton type. This method is used to circumvent
/// order of construction issues. Singleton objects are destroyed after all
/// other allocations in the reverse order of their creation.
template <typename Type>
class Singleton
{
public:
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
static Type * get()
{
static Type * singleton = NULL;
if (singleton == NULL) {
Impl::singleton_create_function_type create_func = &create;
Impl::singleton_destroy_function_type destroy_func = &destroy;
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
}
return singleton;
}
private:
/// Call the Type destructor
static void destroy(void * ptr)
{
reinterpret_cast<Type*>(ptr)->~Type();
}
/// placement new the Type in buffer
static void * create(void * buffer)
{
return new (buffer) Type();
}
};
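A usage sketch for the Singleton helper above, assuming a hypothetical default-constructible type Cache and an initialized allocation pool (illustrative only):
struct Cache { int hits; Cache() : hits(0) {} };        // hypothetical payload
void singleton_demo()
{
  Cache * c = Kokkos::Impl::Singleton<Cache>::get();    // constructed on first call
  ++c->hits;                                            // same instance everywhere; destroyed
                                                        // after all other allocations, in
                                                        // reverse order of creation
}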
//-----------------------------------------------------------------------------
// AllocatorBase
//-----------------------------------------------------------------------------
/// class AllocatorBase
///
/// Abstract base class for all Allocators.
/// Allocators should be singleton objects; use Singleton<Allocator>::get() to create
/// them and avoid order-of-destruction issues
class AllocatorBase
{
public:
/// name of the allocator
/// used to report memory leaks
virtual const char * name() const = 0;
/// Allocate a buffer of size bytes
virtual void* allocate(size_t size) const = 0;
/// Deallocate a buffer of size bytes
/// The pointer must have been allocated with a call to the corresponding allocate
virtual void deallocate(void * ptr, size_t size) const = 0;
/// Changes the size of the memory block pointed to by ptr.
/// ptr must have been allocated with the corresponding allocate call
/// The function may move the memory block to a new location
/// (whose address is returned by the function).
///
/// The content of the memory block is preserved up to the lesser of the new and
/// old sizes, even if the block is moved to a new location. If the new size is larger,
/// the value of the newly allocated portion is indeterminate.
///
/// If ptr is a null pointer, the function behaves like allocate, allocating a
/// new block of size bytes and returning a pointer to its beginning.
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
/// can a texture object be bound to the allocated memory
virtual bool support_texture_binding() const = 0;
/// virtual destructor
virtual ~AllocatorBase() {}
};
/// class AllocatorAttributeBase
class AllocatorAttributeBase
{
public:
virtual ~AllocatorAttributeBase() {}
};
//-----------------------------------------------------------------------------
// Allocator< StaticAllocator > : public AllocatorBase
//-----------------------------------------------------------------------------
// HasStaticName
template<typename T>
class HasStaticName
{
typedef const char * (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::name>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
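The detection idiom above can be sanity-checked as follows (illustrative only; A and B are hypothetical types):
struct A { static const char * name() { return "A"; } };
struct B {};
static_assert(  HasStaticName<A>::value, "A exposes a static name()" );
static_assert( !HasStaticName<B>::value, "B does not" );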
template <typename T>
inline
typename enable_if<HasStaticName<T>::value, const char *>::type
allocator_name()
{
return T::name();
}
template <typename T>
inline
typename enable_if<!HasStaticName<T>::value, const char *>::type
allocator_name()
{
return "Unnamed Allocator";
}
// HasStaticAllocate
template<typename T>
class HasStaticAllocate
{
typedef void * (*static_method)(size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t size)
{
return T::allocate(size);
}
template <typename T>
inline
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot allocate memory!") );
return NULL;
}
// HasStaticDeallocate
template<typename T>
class HasStaticDeallocate
{
typedef void (*static_method)(void *, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void * ptr, size_t size)
{
T::deallocate(ptr,size);
}
template <typename T>
inline
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void *, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot deallocate memory!") );
}
// HasStaticReallocate
template<typename T>
class HasStaticReallocate
{
typedef void * (*static_method)(void *, size_t, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
return T::reallocate(old_ptr, old_size, new_size);
}
template <typename T>
inline
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void *, size_t, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot reallocate memory!") );
return NULL;
}
// HasStaticSupportTextureBinding
template<typename T>
class HasStaticSupportTextureBinding
{
typedef bool (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return T::support_texture_binding();
}
template <typename T>
inline
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return false;
}
template <typename T>
class Allocator : public AllocatorBase
{
public:
virtual const char * name() const
{
return allocator_name<T>();
}
virtual void* allocate(size_t size) const
{
return allocator_allocate<T>(size);
}
virtual void deallocate(void * ptr, size_t size) const
{
allocator_deallocate<T>(ptr,size);
}
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
{
return allocator_reallocate<T>(old_ptr, old_size, new_size);
}
virtual bool support_texture_binding() const
{
return allocator_support_texture_binding<T>();
}
static AllocatorBase * singleton()
{
return Singleton< Allocator<T> >::get();
}
};
//-----------------------------------------------------------------------------
// AllocationTracker
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct MallocHelper;
/// class AllocationTracker
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
/// Reference counting is disabled when the host is in parallel.
class AllocationTracker
{
// use the least significant bit of the AllocationRecord pointer to indicate if the
// AllocationTracker should reference count
enum {
REF_COUNT_BIT = static_cast<uintptr_t>(1)
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
};
public:
/// Find an AllocationTracker such that
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
/// O(n) where n is the number of tracked allocations.
template <typename StaticAllocator>
static AllocationTracker find( void const * ptr )
{
return find( ptr, Allocator<StaticAllocator>::singleton() );
}
/// Pretty print all the currently tracked memory
static void print_tracked_memory( std::ostream & out );
/// Default constructor
KOKKOS_INLINE_FUNCTION
AllocationTracker()
: m_alloc_rec(0)
{}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label = std::string("") )
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, size_t arg_alloc_size
, const std::string & arg_label = std::string("")
)
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Copy an AllocationTracker
KOKKOS_INLINE_FUNCTION
AllocationTracker( const AllocationTracker & rhs )
: m_alloc_rec( rhs.m_alloc_rec)
{
#if !defined( __CUDA_ARCH__ )
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
#endif
}
/// Copy-assign an AllocationTracker
/// Decrement the reference count of the current tracker if necessary
KOKKOS_INLINE_FUNCTION
AllocationTracker & operator=( const AllocationTracker & rhs )
{
if (this != &rhs) {
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
m_alloc_rec = rhs.m_alloc_rec;
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
#endif
}
return * this;
}
/// Destructor
/// Decrement the reference count if necessary
KOKKOS_INLINE_FUNCTION
~AllocationTracker()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
}
/// Is the tracker valid?
KOKKOS_INLINE_FUNCTION
bool is_valid() const
{
return (m_alloc_rec & REF_COUNT_MASK);
}
/// clear the tracker
KOKKOS_INLINE_FUNCTION
void clear()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
m_alloc_rec = 0;
}
/// is this tracker currently counting allocations?
KOKKOS_INLINE_FUNCTION
bool ref_counting() const
{
return (m_alloc_rec & REF_COUNT_BIT);
}
AllocatorBase * allocator() const;
/// pointer to the allocated memory
void * alloc_ptr() const;
/// size in bytes of the allocated memory
size_t alloc_size() const;
/// the current reference count
size_t ref_count() const;
/// the label given to the allocation
char const * label() const;
/// pretty print all the tracker's information to the std::ostream
void print( std::ostream & oss) const;
/// set an attribute ptr on the allocation record
/// the arg_attribute pointer will be deleted when the record is destroyed
/// the attribute ptr can only be set once
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
/// get the attribute ptr from the allocation record
AllocatorAttributeBase * attribute() const;
/// reallocate the memory tracked by this allocation
/// NOT thread-safe
void reallocate( size_t size ) const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
void initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, std::string const & label );
void increment_ref_count() const;
void decrement_ref_count() const;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_ALLOCATION_TRACKER_HPP
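A usage sketch of the (legacy) tracker interface above, assuming a hypothetical StaticAllocator named MyAlloc (illustrative only, not part of the commit):
#include <cstdlib>
struct MyAlloc {                                        // hypothetical StaticAllocator
  static const char * name() { return "MyAlloc"; }
  static void * allocate(size_t n) { return malloc(n); }
  static void deallocate(void * p, size_t) { free(p); }
};
void tracker_demo()
{
  // allocates 128 bytes via Allocator<MyAlloc> and starts reference counting
  Kokkos::Impl::AllocationTracker t( MyAlloc(), 128, "demo buffer" );
  void * p = t.alloc_ptr();   // the tracked allocation
  (void) p;
}                             // last copy destroyed here -> deallocate runs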

View File

@ -0,0 +1,197 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Concepts.hpp>
#include <impl/Kokkos_Tags.hpp>
namespace Kokkos { namespace Impl {
template < typename ExecutionSpace = void
, typename Schedule = void
, typename WorkTag = void
, typename IndexType = void
, typename IterationPattern = void
>
struct PolicyTraitsBase
{
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
using execution_space = ExecutionSpace;
using schedule_type = Schedule;
using work_tag = WorkTag;
using index_type = IndexType;
using iteration_pattern = IterationPattern;
};
template <typename PolicyBase, typename ExecutionSpace>
struct SetExecutionSpace
{
static_assert( is_void<typename PolicyBase::execution_space>::value
, "Kokkos Error: More than one execution space given" );
using type = PolicyTraitsBase< ExecutionSpace
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename Schedule>
struct SetSchedule
{
static_assert( is_void<typename PolicyBase::schedule_type>::value
, "Kokkos Error: More than one schedule type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, Schedule
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename WorkTag>
struct SetWorkTag
{
static_assert( is_void<typename PolicyBase::work_tag>::value
, "Kokkos Error: More than one work tag given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, WorkTag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IndexType>
struct SetIndexType
{
static_assert( is_void<typename PolicyBase::index_type>::value
, "Kokkos Error: More than one index type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, IndexType
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IterationPattern>
struct SetIterationPattern
{
static_assert( is_void<typename PolicyBase::iteration_pattern>::value
, "Kokkos Error: More than one iteration_pattern given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, IterationPattern
>;
};
template <typename Base, typename... Traits>
struct AnalyzePolicy;
template <typename Base, typename T, typename... Traits>
struct AnalyzePolicy<Base, T, Traits...> : public
AnalyzePolicy<
typename std::conditional< is_execution_space<T>::value , SetExecutionSpace<Base,T>
, typename std::conditional< is_schedule_type<T>::value , SetSchedule<Base,T>
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
, SetWorkTag<Base,T>
>::type >::type >::type >::type>::type::type
, Traits...
>
{};
template <typename Base>
struct AnalyzePolicy<Base>
{
using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
, DefaultExecutionSpace
, typename Base::execution_space
>::type;
using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
, Schedule< Static >
, typename Base::schedule_type
>::type;
using work_tag = typename Base::work_tag;
using index_type = typename std::conditional< is_void< typename Base::index_type >::value
, IndexType< typename execution_space::size_type >
, typename Base::index_type
>::type
::type // nasty hack to make index_type into an integral_type
; // instead of the wrapped IndexType<T> for backwards compatibility
using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
, void // TODO set default iteration pattern
, typename Base::iteration_pattern
>::type;
using type = PolicyTraitsBase< execution_space
, schedule_type
, work_tag
, index_type
, iteration_pattern
>;
};
template <typename... Traits>
struct PolicyTraits
: public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
{};
}} // namespace Kokkos::Impl
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
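A sketch of what the trait analysis above computes, using standard Kokkos policy arguments (illustrative only; assumes a Serial-enabled build and <type_traits>):
using P = Kokkos::Impl::PolicyTraits< Kokkos::Serial
                                    , Kokkos::Schedule< Kokkos::Static >
                                    , Kokkos::IndexType< int >
                                    >;
static_assert( std::is_same< P::execution_space, Kokkos::Serial >::value, "" );
static_assert( std::is_same< P::schedule_type, Kokkos::Schedule< Kokkos::Static > >::value, "" );
static_assert( std::is_same< P::index_type, int >::value
             , "IndexType<int> unwraps to int for backwards compatibility" );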

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
if( return_val == compare ) {
const T tmp = *dest = val;
// Don't use the following line of code here:
//
//const T tmp = *dest = val;
//
// Instead, put each assignment in its own statement. This is
// because the overload of T::operator= for volatile *this should
// return void, not volatile T&. See Kokkos #177:
//
// https://github.com/kokkos/kokkos/issues/177
*dest = val;
const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
@ -239,7 +249,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
{
retval = dest[0];
if ( retval == compare )
dest[0] = val;
dest[0] = val;
}
return retval;
}
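A minimal reproduction of the Kokkos #177 pitfall described in the comments above (illustrative only; MyScalar is a hypothetical user type):
struct MyScalar {
  double x;
  MyScalar & operator=( const MyScalar & rhs ) { x = rhs.x; return *this; }
  // Kokkos-style value types return void from the volatile overload, so the
  // chained form 'const MyScalar tmp = *dest = val;' fails to compile, while
  // '*dest = val; const MyScalar tmp = *dest;' is well-formed:
  void operator=( const MyScalar & rhs ) volatile { x = rhs.x; }
};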

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest ,
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = val;
// Don't use the following line of code here:
//
//const T tmp = *dest = val;
//
// Instead, put each assignment in its own statement. This is
// because the overload of T::operator= for volatile *this should
// return void, not volatile T&. See Kokkos #177:
//
// https://github.com/kokkos/kokkos/issues/177
*dest = val;
const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest ,
// member. The volatile return value implicitly defines a
// dereference that some compilers (gcc 4.7.2) warn is being ignored.
// Suppress warning by casting return to void.
(void)( *dest = val );
//(void)( *dest = val );
*dest = val;
Impl::unlock_address_host_space( (void*) dest );
}
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest ,
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assumed.i != oldval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
@ -156,9 +156,26 @@ T atomic_fetch_add( volatile T * const dest ,
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * dest , const int val )
{
int original = val;
__asm__ __volatile__(
"lock xadd %1, %0"
: "+m" (*dest), "+r" (original)
: "m" (*dest), "r" (original)
: "memory"
);
return original;
}
#else
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * const dest , const int val )
{ return __sync_fetch_and_add(dest,val); }
{ return __sync_fetch_and_add(dest, val); }
#endif
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_add( volatile long int * const dest , const long int val )
@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest ,
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = return_val + val;
// Don't use the following line of code here:
//
//const T tmp = *dest = return_val + val;
//
// Instead, put each assignment in its own statement. This is
// because the overload of T::operator= for volatile *this should
// return void, not volatile T&. See Kokkos #177:
//
// https://github.com/kokkos/kokkos/issues/177
*dest = return_val + val;
const T tmp = *dest;
(void) tmp;
Impl::unlock_address_host_space( (void*) dest );
return return_val;

View File

@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest ,
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assumed.i != oldval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}

View File

@ -48,6 +48,22 @@
namespace Kokkos {
namespace Impl {
template<class Scalar1, class Scalar2>
struct MaxOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return (val1 > val2 ? val1 : val2);
}
};
template<class Scalar1, class Scalar2>
struct MinOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return (val1 < val2 ? val1 : val2);
}
};
template<class Scalar1, class Scalar2>
struct AddOper {
KOKKOS_FORCEINLINE_FUNCTION
@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
namespace Kokkos {
// Fetch_Oper atomics: return value before operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_max(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_min(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mul(volatile T * const dest, const T val) {
@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
// Oper Fetch atomics: return value after operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_max_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_min_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
}
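A usage sketch for the new max/min atomics above, highlighting the fetch/oper naming convention (illustrative only):
void max_demo( volatile int * shared_max, int my_val )
{
  int before = Kokkos::atomic_fetch_max( shared_max, my_val ); // value before the max
  int after  = Kokkos::atomic_max_fetch( shared_max, my_val ); // value after the max
  (void) before; (void) after;
}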
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mul_fetch(volatile T * const dest, const T val) {

View File

@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
typedef int64_t type;
};
#if ! KOKKOS_USING_EXP_VIEW
class AllocationTracker;
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
ViewTraits ,
typename enable_if<
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
( ViewTraits::memory_traits::Atomic )
>::type >
{
private:
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
// (sizeof(typename ViewTraits::const_value_type)==8),
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
// atomic_view_possible;
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
typedef ViewDataHandle self_type;
public:
enum { ReturnTypeIsReference = false };
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
typedef Impl::AtomicDataElement<ViewTraits> return_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
{
return handle_type(arg_data_ptr);
}
};
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
}} // namespace Kokkos::Impl
#endif

View File

@ -1,287 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
void* MallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
if (size) {
ptr = malloc(size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
if (ptr) {
free(ptr);
}
}
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = realloc(old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
/*--------------------------------------------------------------------------*/
namespace {
void * raw_aligned_allocate( size_t size, size_t alignment )
{
void * ptr = NULL;
if ( size ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
posix_memalign( & ptr, alignment , size );
#else
// Over-allocate and round up to guarantee proper alignment.
size_t size_padded = size + alignment + sizeof(void *);
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
#endif
}
return ptr;
}
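A worked example of the fallback path's alignment arithmetic (illustrative numbers, not part of the commit):
//   Suppose malloc returns alloc_ptr = 0x1000 and alignment = 64.
//   address = 0x1000 + sizeof(void*) = 0x1008
//   rem     = 0x1008 % 64 = 8, so offset = 64 - 8 = 56
//   ptr     = 0x1008 + 56 = 0x1040                (64-byte aligned)
//   The word at ptr - sizeof(void*) = 0x1038 stores alloc_ptr, which is how
//   raw_aligned_deallocate below recovers the original pointer to free.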
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
{
if ( ptr ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
free( ptr );
#else
// get the alloc'd pointer
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
free( alloc_ptr );
#endif
}
}
}
void* AlignedAllocator::allocate( size_t size )
{
void * ptr = 0 ;
if ( size ) {
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void AlignedAllocator::deallocate( void * ptr, size_t size )
{
raw_aligned_deallocate( ptr, size);
}
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size < new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, old_size );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
// mmap flags for private anonymous memory allocation
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#else
#define NO_MMAP
#endif
// huge page tables
#if !defined( NO_MMAP )
#if defined( MAP_HUGETLB )
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
#elif defined( MMAP_FLAGS )
#define MMAP_FLAGS_HUGE MMAP_FLAGS
#endif
// threshold to use huge pages
#define MMAP_USE_HUGE_PAGES (1u << 27)
#endif
// read write access to private memory
#if !defined( NO_MMAP )
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
#endif
void* PageAlignedAllocator::allocate( size_t size )
{
void *ptr = NULL;
if (size) {
#if !defined NO_MMAP
if ( size < MMAP_USE_HUGE_PAGES ) {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
} else {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
}
if (ptr == MAP_FAILED) {
ptr = NULL;
}
#else
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
ptr = raw_aligned_allocate( size, page_size);
#endif
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
{
#if !defined( NO_MMAP )
munmap(ptr, size);
#else
raw_aligned_deallocate(ptr, size);
#endif
}
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = NULL;
#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ )
if (old_size != new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
deallocate( old_ptr, old_size );
}
else {
ptr = old_ptr;
}
#else
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
if (ptr == MAP_FAILED) {
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
}
#endif
return ptr;
}
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
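A usage sketch of the allocator classes in this file (illustrative only; assumes the static interface declared in Kokkos_BasicAllocators.hpp):
#include <impl/Kokkos_BasicAllocators.hpp>
void allocator_demo()
{
  void * p = Kokkos::Impl::AlignedAllocator::allocate( 1024 );  // MEMORY_ALIGNMENT-aligned
  Kokkos::Impl::AlignedAllocator::deallocate( p, 1024 );
}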

Some files were not shown because too many files have changed in this diff.