forked from lijiext/lammps
Kokkos lib update
This commit is contained in:
parent
0252347d43
commit
236ebf7fab
|
@ -1,4 +1,15 @@
|
|||
|
||||
IF(COMMAND TRIBITS_PACKAGE_DECL)
|
||||
SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
|
||||
ELSE()
|
||||
SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
|
||||
ENDIF()
|
||||
|
||||
IF(NOT KOKKOS_HAS_TRILINOS)
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
|
||||
INCLUDE(cmake/tribits.cmake)
|
||||
ENDIF()
|
||||
|
||||
#
|
||||
# A) Forward delcare the package so that certain options are also defined for
|
||||
# subpackages
|
||||
|
@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
|
|||
# subpackages as well.
|
||||
#
|
||||
|
||||
TRIBITS_ADD_DEBUG_OPTION()
|
||||
|
||||
|
||||
# mfh 01 Aug 2016: See Issue #61:
|
||||
#
|
||||
# https://github.com/kokkos/kokkos/issues/61
|
||||
#
|
||||
# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
|
||||
# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
|
||||
# for compatibility with Kokkos' Makefile build system.
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
${PACKAGE_NAME}_ENABLE_DEBUG
|
||||
${PACKAGE_NAME_UC}_HAVE_DEBUG
|
||||
"Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
|
||||
${${PROJECT_NAME}_ENABLE_DEBUG}
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_SIERRA_BUILD
|
||||
|
@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
|
|||
"${TPL_ENABLE_MPI}"
|
||||
)
|
||||
|
||||
# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option
|
||||
#
|
||||
# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check
|
||||
# option (defined below) is annoyingly not all caps, but we need to
|
||||
# keep it that way for backwards compatibility. If users forget and
|
||||
# try using an all-caps variable, then make it count by using the
|
||||
# all-caps version as the default value of the original, not-all-caps
|
||||
# option. Otherwise, the default value of this option comes from
|
||||
# Kokkos_ENABLE_DEBUG (see Issue #367).
|
||||
|
||||
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
|
||||
IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
|
||||
IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
|
||||
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
|
||||
ELSE()
|
||||
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
|
||||
ENDIF()
|
||||
ELSE()
|
||||
SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
|
||||
ENDIF()
|
||||
ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Debug_Bounds_Check
|
||||
KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
"Enable bounds checking support in Kokkos."
|
||||
OFF
|
||||
"Enable Kokkos::View run-time bounds checking."
|
||||
"${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
|
|
|
@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS)
|
|||
#Options: OpenMP,Serial,Pthreads,Cuda
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthreads"
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
|
||||
KOKKOS_ARCH ?= ""
|
||||
#Options: yes,no
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
|
@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
|
|||
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
|
||||
|
||||
#NVIDIA based
|
||||
|
@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
|
|||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
|
||||
|
@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
|
|||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
|
||||
|
@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM
|
|||
|
||||
#Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
|
||||
# Decide what ISA level we are able to support
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
|
||||
|
||||
|
@ -304,8 +308,8 @@ endif
|
|||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
||||
|
@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
|||
|
||||
else
|
||||
# Assume that this is a really a GNU compiler
|
||||
KOKKOS_CXXFLAGS += -march=core-avx2
|
||||
KOKKOS_LDFLAGS += -march=core-avx2
|
||||
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
|
||||
KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
|
|||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_53
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += -arch=sm_61
|
||||
endif
|
||||
endif
|
||||
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
|
||||
Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
|
||||
Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
|
||||
Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
|
||||
Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
|
||||
|
@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
|
||||
Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
|
||||
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
|
||||
Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
|
||||
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
|
@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
|
||||
Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
|
||||
Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
|
||||
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
|
||||
endif
|
||||
|
@ -61,6 +61,8 @@ endif
|
|||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
|
||||
Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
|
||||
endif
|
||||
|
||||
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
|
|
|
@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov
|
|||
====Requirements============================================================
|
||||
============================================================================
|
||||
|
||||
Primary tested compilers are:
|
||||
Primary tested compilers on X86 are:
|
||||
GCC 4.7.2
|
||||
GCC 4.8.4
|
||||
GCC 4.9.2
|
||||
|
@ -48,26 +48,43 @@ Primary tested compilers are:
|
|||
Clang 3.5.2
|
||||
Clang 3.6.1
|
||||
|
||||
Primary tested compilers on Power 8 are:
|
||||
IBM XL 13.1.3 (OpenMP,Serial)
|
||||
GCC 4.9.2 (OpenMP,Serial)
|
||||
GCC 5.3.0 (OpenMP,Serial)
|
||||
|
||||
Secondary tested compilers are:
|
||||
CUDA 6.5 (with gcc 4.7.2)
|
||||
CUDA 7.0 (with gcc 4.7.2)
|
||||
CUDA 7.5 (with gcc 4.8.4)
|
||||
|
||||
Other compilers working:
|
||||
PGI 15.4
|
||||
IBM XL 13.1.2
|
||||
Cygwin 2.1.0 64bit with gcc 4.9.3
|
||||
X86:
|
||||
Intel 17.0.042 (the FENL example causes internal compiler error)
|
||||
PGI 15.4
|
||||
Cygwin 2.1.0 64bit with gcc 4.9.3
|
||||
KNL:
|
||||
Intel 16.2.181 (the FENL example causes internal compiler error)
|
||||
Intel 17.0.042 (the FENL example causes internal compiler error)
|
||||
|
||||
Known non-working combinations:
|
||||
Power8:
|
||||
GCC 6.1.0
|
||||
Pthreads backend
|
||||
|
||||
|
||||
Primary tested compiler are passing in release mode
|
||||
with warnings as errors. We are using the following set
|
||||
of flags:
|
||||
with warnings as errors. They also are tested with a comprehensive set of
|
||||
backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
|
||||
We are using the following set of flags:
|
||||
GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
|
||||
-Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
|
||||
Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
|
||||
Secondary compilers are passing without -Werror.
|
||||
Other compilers are tested occasionally.
|
||||
Other compilers are tested occasionally, in particular when pushing from develop to
|
||||
master branch, without -Werror and only for a select set of backends.
|
||||
|
||||
============================================================================
|
||||
====Getting started=========================================================
|
||||
|
|
|
@ -771,6 +771,7 @@ namespace Kokkos {
|
|||
friend class Random_XorShift1024_Pool<DeviceType>;
|
||||
public:
|
||||
|
||||
typedef Random_XorShift1024_Pool<DeviceType> pool_type;
|
||||
typedef DeviceType device_type;
|
||||
|
||||
enum {MAX_URAND = 0xffffffffU};
|
||||
|
@ -779,10 +780,10 @@ namespace Kokkos {
|
|||
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
|
||||
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
|
||||
p_(p),state_idx_(state_idx){
|
||||
for(int i=0 ; i<16; i++)
|
||||
state_[i] = state[i];
|
||||
state_[i] = state(state_idx,i);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -933,6 +934,7 @@ namespace Kokkos {
|
|||
state_data_type state_;
|
||||
int_view_type p_;
|
||||
int num_states_;
|
||||
friend class Random_XorShift1024<DeviceType>;
|
||||
|
||||
public:
|
||||
typedef Random_XorShift1024<DeviceType> generator_type;
|
||||
|
@ -1001,7 +1003,7 @@ namespace Kokkos {
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024<DeviceType> get_state() const {
|
||||
const int i = DeviceType::hardware_thread_id();
|
||||
return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
|
||||
return Random_XorShift1024<DeviceType>(state_,p_(i),i);
|
||||
};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1020,10 +1022,12 @@ namespace Kokkos {
|
|||
int p_;
|
||||
const int state_idx_;
|
||||
uint64_t* state_;
|
||||
const int stride_;
|
||||
friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
|
||||
public:
|
||||
|
||||
typedef Kokkos::Cuda device_type;
|
||||
typedef Random_XorShift1024_Pool<device_type> pool_type;
|
||||
|
||||
enum {MAX_URAND = 0xffffffffU};
|
||||
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
|
||||
|
@ -1031,30 +1035,30 @@ namespace Kokkos {
|
|||
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
|
||||
p_(p),state_idx_(state_idx),state_(state){
|
||||
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
|
||||
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand() {
|
||||
uint64_t state_0 = state_[ p_ ];
|
||||
uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
|
||||
uint64_t state_0 = state_[ p_ * stride_ ];
|
||||
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
|
||||
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
|
||||
tmp = tmp>>16;
|
||||
return static_cast<uint32_t>(tmp&MAX_URAND);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64() {
|
||||
uint64_t state_0 = state_[ p_ ];
|
||||
uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
|
||||
uint64_t state_0 = state_[ p_ * stride_ ];
|
||||
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
|
||||
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
|
|||
if(i>=num_states_) {i = i_offset;}
|
||||
}
|
||||
|
||||
return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
|
||||
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
|
||||
#else
|
||||
return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
|
||||
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
|
|||
#endif
|
||||
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<class ViewType, class RandomPool, int loops, int rank>
|
||||
template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
|
||||
struct fill_random_functor_range;
|
||||
template<class ViewType, class RandomPool, int loops, int rank>
|
||||
template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
|
||||
struct fill_random_functor_begin_end;
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (const IndexType& i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0())
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0()))
|
||||
a(idx) = Rand::draw(gen,range);
|
||||
}
|
||||
rand_pool.free_state(gen);
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
a(idx,k) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
|
@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
|
|||
};
|
||||
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
a(idx,k,l) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
|
@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
a(idx,k,l,m) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
|
@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
a(idx,k,l,m,n) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
|
@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(unsigned int o=0;o<a.dimension_5();o++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
|
||||
a(idx,k,l,m,n,o) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
|
@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(unsigned int o=0;o<a.dimension_5();o++)
|
||||
for(unsigned int p=0;p<a.dimension_6();p++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
|
||||
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
|
||||
a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
|
@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
|
|||
a(a_),rand_pool(rand_pool_),range(range_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(unsigned int o=0;o<a.dimension_5();o++)
|
||||
for(unsigned int p=0;p<a.dimension_6();p++)
|
||||
for(unsigned int q=0;q<a.dimension_7();q++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
|
||||
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
|
||||
for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
|
||||
a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
|
||||
}
|
||||
}
|
||||
rand_pool.free_state(gen);
|
||||
}
|
||||
};
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0())
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0()))
|
||||
a(idx) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
rand_pool.free_state(gen);
|
||||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
a(idx,k) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
|
|||
};
|
||||
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
a(idx,k,l) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1584,14 +1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
a(idx,k,l,m) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()){
|
||||
for(unsigned int l=0;l<a.dimension_1();l++)
|
||||
for(unsigned int m=0;m<a.dimension_2();m++)
|
||||
for(unsigned int n=0;n<a.dimension_3();n++)
|
||||
for(unsigned int o=0;o<a.dimension_4();o++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())){
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++)
|
||||
a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(unsigned int o=0;o<a.dimension_5();o++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
|
||||
a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
|
|||
};
|
||||
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(unsigned int o=0;o<a.dimension_5();o++)
|
||||
for(unsigned int p=0;p<a.dimension_6();p++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
|
||||
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
|
||||
a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool, int loops>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
|
||||
template<class ViewType, class RandomPool, int loops, class IndexType>
|
||||
struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
|
||||
typedef typename ViewType::execution_space execution_space;
|
||||
ViewType a;
|
||||
RandomPool rand_pool;
|
||||
|
@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
|
|||
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (unsigned int i) const {
|
||||
void operator() (IndexType i) const {
|
||||
typename RandomPool::generator_type gen = rand_pool.get_state();
|
||||
for(unsigned int j=0;j<loops;j++) {
|
||||
const uint64_t idx = i*loops+j;
|
||||
if(idx<a.dimension_0()) {
|
||||
for(unsigned int k=0;k<a.dimension_1();k++)
|
||||
for(unsigned int l=0;l<a.dimension_2();l++)
|
||||
for(unsigned int m=0;m<a.dimension_3();m++)
|
||||
for(unsigned int n=0;n<a.dimension_4();n++)
|
||||
for(unsigned int o=0;o<a.dimension_5();o++)
|
||||
for(unsigned int p=0;p<a.dimension_6();p++)
|
||||
for(unsigned int q=0;q<a.dimension_7();q++)
|
||||
for(IndexType j=0;j<loops;j++) {
|
||||
const IndexType idx = i*loops+j;
|
||||
if(idx<static_cast<IndexType>(a.dimension_0())) {
|
||||
for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
|
||||
for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
|
||||
for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
|
||||
for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
|
||||
for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
|
||||
for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
|
||||
for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
|
||||
a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
|
||||
}
|
||||
}
|
||||
|
@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
|
|||
}
|
||||
};
|
||||
|
||||
template<class ViewType, class RandomPool>
|
||||
}
|
||||
|
||||
template<class ViewType, class RandomPool, class IndexType = int64_t>
|
||||
void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
|
||||
int64_t LDA = a.dimension_0();
|
||||
if(LDA>0)
|
||||
parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
|
||||
parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
|
||||
}
|
||||
|
||||
template<class ViewType, class RandomPool>
|
||||
template<class ViewType, class RandomPool, class IndexType = int64_t>
|
||||
void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
|
||||
int64_t LDA = a.dimension_0();
|
||||
if(LDA>0)
|
||||
parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
|
||||
parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -50,6 +50,7 @@
|
|||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Random.hpp>
|
||||
#include <cmath>
|
||||
#include <chrono>
|
||||
|
||||
namespace Test {
|
||||
|
||||
|
@ -207,7 +208,6 @@ struct test_histogram1d_functor {
|
|||
density_1d (d1d),
|
||||
mean (1.0*num_draws/HIST_DIM1D*3)
|
||||
{
|
||||
printf ("Mean: %e\n", mean);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void
|
||||
|
@ -295,7 +295,7 @@ struct test_random_scalar {
|
|||
parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
|
||||
|
||||
//printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
|
||||
double tolerance = 2.0*sqrt(1.0/num_draws);
|
||||
double tolerance = 1.6*sqrt(1.0/num_draws);
|
||||
double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
|
||||
double variance_expect = 1.0/3.0*mean_expect*mean_expect;
|
||||
double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
|
||||
|
@ -303,10 +303,10 @@ struct test_random_scalar {
|
|||
double covariance_eps = result.covariance/num_draws/2/variance_expect;
|
||||
pass_mean = ((-tolerance < mean_eps) &&
|
||||
( tolerance > mean_eps)) ? 1:0;
|
||||
pass_var = ((-tolerance < variance_eps) &&
|
||||
( tolerance > variance_eps)) ? 1:0;
|
||||
pass_covar = ((-1.4*tolerance < covariance_eps) &&
|
||||
( 1.4*tolerance > covariance_eps)) ? 1:0;
|
||||
pass_var = ((-1.5*tolerance < variance_eps) &&
|
||||
( 1.5*tolerance > variance_eps)) ? 1:0;
|
||||
pass_covar = ((-2.0*tolerance < covariance_eps) &&
|
||||
( 2.0*tolerance > covariance_eps)) ? 1:0;
|
||||
cerr << "Pass: " << pass_mean
|
||||
<< " " << pass_var
|
||||
<< " " << mean_eps
|
||||
|
@ -328,12 +328,12 @@ struct test_random_scalar {
|
|||
double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
|
||||
double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
|
||||
double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
|
||||
pass_hist1d_mean = ((-tolerance < mean_eps) &&
|
||||
( tolerance > mean_eps)) ? 1:0;
|
||||
pass_hist1d_var = ((-tolerance < variance_eps) &&
|
||||
( tolerance > variance_eps)) ? 1:0;
|
||||
pass_hist1d_covar = ((-tolerance < covariance_eps) &&
|
||||
( tolerance > covariance_eps)) ? 1:0;
|
||||
pass_hist1d_mean = ((-0.0001 < mean_eps) &&
|
||||
( 0.0001 > mean_eps)) ? 1:0;
|
||||
pass_hist1d_var = ((-0.07 < variance_eps) &&
|
||||
( 0.07 > variance_eps)) ? 1:0;
|
||||
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
|
||||
( 0.06 > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 1D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
|
@ -363,8 +363,8 @@ struct test_random_scalar {
|
|||
double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
|
||||
pass_hist3d_mean = ((-tolerance < mean_eps) &&
|
||||
( tolerance > mean_eps)) ? 1:0;
|
||||
pass_hist3d_var = ((-tolerance < variance_eps) &&
|
||||
( tolerance > variance_eps)) ? 1:0;
|
||||
pass_hist3d_var = ((-1.2*tolerance < variance_eps) &&
|
||||
( 1.2*tolerance > variance_eps)) ? 1:0;
|
||||
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
|
||||
( tolerance > covariance_eps)) ? 1:0;
|
||||
|
||||
|
@ -386,8 +386,13 @@ void test_random(unsigned int num_draws)
|
|||
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
|
||||
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
|
||||
|
||||
|
||||
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
|
||||
cerr << "Test Seed:" << ticks << endl;
|
||||
|
||||
RandomGenerator pool(ticks);
|
||||
|
||||
cerr << "Test Scalar=int" << endl;
|
||||
RandomGenerator pool(31891);
|
||||
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int.pass_mean,1);
|
||||
ASSERT_EQ( test_int.pass_var,1);
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
# Check for CUDA support
|
||||
|
||||
SET(_CUDA_FAILURE OFF)
|
||||
|
||||
# Have CMake find CUDA
|
||||
IF(NOT _CUDA_FAILURE)
|
||||
FIND_PACKAGE(CUDA 3.2)
|
||||
IF (NOT CUDA_FOUND)
|
||||
SET(_CUDA_FAILURE ON)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF(NOT _CUDA_FAILURE)
|
||||
# if we haven't met failure
|
||||
macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
|
||||
TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
|
||||
endmacro()
|
||||
GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
|
||||
GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
|
||||
GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
|
||||
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
|
||||
ELSE()
|
||||
SET(TPL_ENABLE_CUDA OFF)
|
||||
ENDIF()
|
|
@ -0,0 +1,64 @@
|
|||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
|
||||
|
||||
IF (TPL_ENABLE_CUDA)
|
||||
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
|
||||
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
|
||||
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
|
||||
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
|
||||
ENDIF()
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Hardware locality detection and control library.
|
||||
#
|
||||
# Acquisition information:
|
||||
# Date checked: November 2011
|
||||
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
|
||||
# Source: http://www.open-mpi.org/projects/hwloc/
|
||||
# Version: 1.3
|
||||
#
|
||||
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
|
||||
REQUIRED_HEADERS hwloc.h
|
||||
REQUIRED_LIBS_NAMES "hwloc"
|
||||
)
|
|
@ -0,0 +1,83 @@
|
|||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
|
||||
# Pthread TPL setup.
#
# Prefer CMake's FindThreads module over the TriBITS-style finder when the
# user has not pre-seeded any TPL_Pthread_* variables, because FindThreads
# can detect the case where pthreads is built into the compiler and no
# separate library is needed -- a case the TriBITS TPL finder cannot express.
SET(USE_THREADS FALSE)

IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
  # Use CMake's Thread finder since it is a bit smarter in determining
  # whether pthreads is already built into the compiler and doesn't need
  # a library to link.
  FIND_PACKAGE(Threads)
  # If Threads found a copy of pthreads make sure it is one of the cases the
  # tribits tpl system cannot handle.
  IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
    IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
      SET(USE_THREADS TRUE)
    ENDIF()
  ENDIF()
ENDIF()

IF(USE_THREADS)
  # FindThreads already proved pthreads works: register the TPL directly with
  # whatever flag (possibly none) CMake reported.
  # NOTE(review): "TIBITS" (missing R) matches the macro's actual spelling in
  # tribits.cmake -- do not "fix" this call site without renaming the macro.
  SET(TPL_Pthread_INCLUDE_DIRS "")
  SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
  SET(TPL_Pthread_LIBRARY_DIRS "")
  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
ELSE()
  # Fall back to the generic header/library search.
  TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
    REQUIRED_HEADERS pthread.h
    REQUIRED_LIBS_NAMES pthread
    )
ENDIF()
|
|
@ -0,0 +1,70 @@
|
|||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Hardware locality detection and control library.
|
||||
#
|
||||
# Acquisition information:
|
||||
# Date checked: July 2014
|
||||
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
|
||||
# Source: https://code.google.com/p/qthreads
|
||||
#
|
||||
|
||||
# Locate the qthreads TPL: requires qthread.h and the "qthread" library.
# On success this creates the TPL_LIB_QTHREAD target.
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
  REQUIRED_HEADERS qthread.h
  REQUIRED_LIBS_NAMES "qthread"
  )
|
||||
|
|
@ -0,0 +1,485 @@
|
|||
INCLUDE(CMakeParseArguments)
|
||||
INCLUDE(CTest)
|
||||
|
||||
# Emit a SEND_ERROR for every variable name in the list that is not defined.
# Configuration continues (SEND_ERROR, not FATAL_ERROR) so all missing
# variables are reported in one run.
FUNCTION(ASSERT_DEFINED VARS)
  FOREACH(_VAR_TO_CHECK ${VARS})
    IF(DEFINED ${_VAR_TO_CHECK})
      # Defined -- nothing to report.
    ELSE()
      MESSAGE(SEND_ERROR "Error, the variable ${_VAR_TO_CHECK} is not defined!")
    ENDIF()
  ENDFOREACH()
ENDFUNCTION()
|
||||
|
||||
# Set ${VARNAME} as an INTERNAL cache variable so the value is visible in
# every directory scope (TriBITS-style "global" variable).
MACRO(GLOBAL_SET VARNAME)
  SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
ENDMACRO()
|
||||
|
||||
# Prepend ${ARGN} to the existing global (INTERNAL cache) variable ${VARNAME}.
# Reports an error (via ASSERT_DEFINED) if ${VARNAME} is not defined yet.
MACRO(PREPEND_GLOBAL_SET VARNAME)
  ASSERT_DEFINED(${VARNAME})
  GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
ENDMACRO()
|
||||
|
||||
# De-duplicate the global list variable named VARNAME in place, preserving
# the first occurrence of each entry.
FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
  ASSERT_DEFINED(${VARNAME})
  # An empty list needs no work (and LIST(REMOVE_DUPLICATES) rejects it).
  IF (NOT ${VARNAME})
    RETURN()
  ENDIF()
  SET(_UNIQUE_ENTRIES ${${VARNAME}})
  LIST(REMOVE_DUPLICATES _UNIQUE_ENTRIES)
  GLOBAL_SET(${VARNAME} ${_UNIQUE_ENTRIES})
ENDFUNCTION()
|
||||
|
||||
# Declare a user-facing BOOL cache option and mirror its value into a global
# ON/OFF "define" variable.
#
#   USER_OPTION_NAME  - cache option presented to the user
#   MACRO_DEFINE_NAME - global variable set to ON/OFF from the option's value
#                       (skipped entirely when the empty string is passed)
#   DOCSTRING         - help text for the cache option
#   DEFAULT_VALUE     - initial value of the option
MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE)
  MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
  SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
  IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
    # Normalize whatever truthy/falsy value the user gave to a clean ON/OFF.
    IF(${USER_OPTION_NAME})
      GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
    ELSE()
      GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
    ENDIF()
  ENDIF()
ENDMACRO()
|
||||
|
||||
# Expand ${PACKAGE_SOURCE_DIR}/cmake/<file>.in into the current binary
# directory as <file>, substituting variable references at configure time.
FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE)

  # Configure the file
  CONFIGURE_FILE(
    ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
    ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
    )

ENDFUNCTION()
|
||||
|
||||
# Add the standard ${PROJECT_NAME}_ENABLE_DEBUG option (default OFF) and
# mirror it into the global HAVE_${PROJECT_NAME_UC}_DEBUG variable.
# NOTE(review): the top-level CMakeLists deliberately avoids this macro for
# Kokkos itself (see kokkos/kokkos issue #61) and defines its own option.
MACRO(TRIBITS_ADD_DEBUG_OPTION)
  TRIBITS_ADD_OPTION_AND_DEFINE(
    ${PROJECT_NAME}_ENABLE_DEBUG
    HAVE_${PROJECT_NAME_UC}_DEBUG
    "Enable a host of runtime debug checking."
    OFF
    )
ENDMACRO()
|
||||
|
||||
|
||||
# Add each listed directory as a test subdirectory. Unlike the example
# variant below, this is unconditional -- no enable flag is consulted.
MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
  FOREACH(TEST_DIR ${ARGN})
    ADD_SUBDIRECTORY(${TEST_DIR})
  ENDFOREACH()
ENDMACRO()
|
||||
|
||||
# Add each listed example directory, but only when examples are enabled for
# this package or for its parent package.
MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
  IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
    FOREACH(_EXAMPLE_SUBDIR ${ARGN})
      ADD_SUBDIRECTORY(${_EXAMPLE_SUBDIR})
    ENDFOREACH()
  ENDIF()
ENDMACRO()
|
||||
|
||||
# Collect property PROP_IN from each target listed in ${ARGN} (via generator
# expressions, so the values resolve at generate time) and store the combined
# list as property PROP_OUT on TARGET_NAME.
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
  SET(PROP_VALUES)
  FOREACH(TARGET_X ${ARGN})
    LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
  ENDFOREACH()
  SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
ENDMACRO()
|
||||
|
||||
# Create a stand-in "interface" library: a STATIC library built from an
# empty generated dummy.cpp, tagged with a custom INTERFACE property.
# NOTE(review): presumably a workaround for CMake versions predating true
# INTERFACE libraries -- confirm against the minimum CMake version required.
MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
  ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
  SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
ENDMACRO()
|
||||
|
||||
# Older versions of cmake does not make include directories transitive
|
||||
# Link TARGET_NAME against the given libraries and explicitly propagate each
# dependency's include directories (both INTERFACE_ and plain
# INCLUDE_DIRECTORIES), because older versions of CMake do not make include
# directories transitive through TARGET_LINK_LIBRARIES.
MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
  TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
  FOREACH(DEP_LIB ${ARGN})
    TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
    TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
  ENDFOREACH()
ENDMACRO()
|
||||
|
||||
# Add a package library target.
#
# Options:
#   STATIC / SHARED           - force the library type
#   TESTONLY                  - exclude from ALL; never installed or exported
#   NO_INSTALL_LIB_OR_HEADERS - build normally but skip installation
#   CUDALIBRARY               - build via CUDA_ADD_LIBRARY
# Multi-value arguments:
#   HEADERS / NOINSTALLHEADERS / SOURCES - inputs (duplicates removed)
#   HEADERS_INSTALL_SUBDIR    - extra directory installed under include/
#   DEPLIBS / IMPORTEDLIBS    - additional link dependencies
#   DEFINES                   - compile definitions (directory-scoped!)
#   ADDED_LIB_TARGET_NAME_OUT - variable receiving the created target name
FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)

  SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
  SET(oneValueArgs)
  SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)

  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  IF(PARSE_HEADERS)
    LIST(REMOVE_DUPLICATES PARSE_HEADERS)
  ENDIF()
  IF(PARSE_SOURCES)
    LIST(REMOVE_DUPLICATES PARSE_SOURCES)
  ENDIF()

  # Local variable to hold all of the libraries that will be directly linked
  # to this library: the package dependency list plus anything passed in.
  SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})

  IF (PARSE_IMPORTEDLIBS)
    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
  ENDIF()

  IF (PARSE_DEPLIBS)
    LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
  ENDIF()

  # NOTE: ADD_DEFINITIONS is directory-scoped, so these definitions also
  # apply to every target declared after this call in the same directory.
  IF (PARSE_DEFINES)
    ADD_DEFINITIONS(${PARSE_DEFINES})
  ENDIF()

  IF (PARSE_STATIC)
    SET(STATIC_KEYWORD "STATIC")
  ELSE()
    SET(STATIC_KEYWORD)
  ENDIF()

  IF (PARSE_SHARED)
    SET(SHARED_KEYWORD "SHARED")
  ELSE()
    SET(SHARED_KEYWORD)
  ENDIF()

  # Test-only libraries are not part of the default build.
  IF (PARSE_TESTONLY)
    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
  ELSE()
    SET(EXCLUDE_FROM_ALL_KEYWORD)
  ENDIF()

  IF (NOT PARSE_CUDALIBRARY)
    ADD_LIBRARY(
      ${LIBRARY_NAME}
      ${STATIC_KEYWORD}
      ${SHARED_KEYWORD}
      ${EXCLUDE_FROM_ALL_KEYWORD}
      ${PARSE_HEADERS}
      ${PARSE_NOINSTALLHEADERS}
      ${PARSE_SOURCES}
      )
  ELSE()
    CUDA_ADD_LIBRARY(
      ${LIBRARY_NAME}
      ${PARSE_HEADERS}
      ${PARSE_NOINSTALLHEADERS}
      ${PARSE_SOURCES}
      )
  ENDIF()

  TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})

  # BUGFIX: this guard previously read
  #   IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
  # which made NO_INSTALL_LIB_OR_HEADERS *force* installation (even for
  # TESTONLY libraries) instead of suppressing it, the opposite of what the
  # option name promises.
  IF (NOT PARSE_TESTONLY AND NOT PARSE_NO_INSTALL_LIB_OR_HEADERS)

    INSTALL(
      TARGETS ${LIBRARY_NAME}
      EXPORT ${PROJECT_NAME}
      RUNTIME DESTINATION bin
      LIBRARY DESTINATION lib
      ARCHIVE DESTINATION lib
      COMPONENT ${PACKAGE_NAME}
      )

    INSTALL(
      FILES ${PARSE_HEADERS}
      EXPORT ${PROJECT_NAME}
      DESTINATION include
      COMPONENT ${PACKAGE_NAME}
      )

    INSTALL(
      DIRECTORY ${PARSE_HEADERS_INSTALL_SUBDIR}
      EXPORT ${PROJECT_NAME}
      DESTINATION include
      COMPONENT ${PACKAGE_NAME}
      )

  ENDIF()

  # Only regular libraries are exported through the package's LIBS list.
  IF (NOT PARSE_TESTONLY)
    PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
    REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
  ENDIF()

  # Honor ADDED_LIB_TARGET_NAME_OUT (it was parsed but silently ignored
  # before); mirrors TRIBITS_ADD_EXECUTABLE's ADDED_EXE_TARGET_NAME_OUT.
  IF (PARSE_ADDED_LIB_TARGET_NAME_OUT)
    SET(${PARSE_ADDED_LIB_TARGET_NAME_OUT} ${LIBRARY_NAME} PARENT_SCOPE)
  ENDIF()

ENDFUNCTION()
|
||||
|
||||
# Add a package executable target.
#
# The binary is named <package>_<exe> unless NOEXEPREFIX is given; the
# final target name is reported through ADDED_EXE_TARGET_NAME_OUT. Sources
# may be listed relative to DIRECTORY. TESTONLY excludes the target from
# ALL; INSTALLABLE installs it to bin.
# NOTE(review): CATEGORIES/HOST/XHOST/HOSTTYPE/XHOSTTYPE/COMM/
# LINKER_LANGUAGE/DEFINES are parsed for TriBITS compatibility but unused.
FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)

  SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
  SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
  SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)

  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Link against the package interface target plus any extra libraries.
  SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})

  IF (PARSE_TESTONLYLIBS)
    LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
  ENDIF()

  IF (PARSE_IMPORTEDLIBS)
    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
  ENDIF()

  # Resolve source paths, optionally relative to DIRECTORY.
  SET (EXE_SOURCES)
  IF(PARSE_DIRECTORY)
    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
      IF(IS_ABSOLUTE ${SOURCE_FILE})
        SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
      ELSE()
        SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
      ENDIF()
    ENDFOREACH( )
  ELSE()
    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
      SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
    ENDFOREACH( )
  ENDIF()

  # Prefix the binary with the package name unless told otherwise.
  SET(EXE_BINARY_NAME ${EXE_NAME})
  IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
    SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
  ENDIF()

  IF (PARSE_TESTONLY)
    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
  ELSE()
    SET(EXCLUDE_FROM_ALL_KEYWORD)
  ENDIF()
  ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})

  # BUGFIX: TARGET_COMPILE_DEFINITIONS used to run before ADD_EXECUTABLE and
  # against ${EXE_NAME}; at that point the target does not exist (and the
  # real target is usually package-prefixed), so the definitions never took
  # effect. Apply them to the actual target, after it is created.
  IF (PARSE_TARGET_DEFINES)
    TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
  ENDIF()

  TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})

  # Report the final (possibly prefixed) target name to the caller.
  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
  ENDIF()

  IF(PARSE_INSTALLABLE)
    INSTALL(
      TARGETS ${EXE_BINARY_NAME}
      EXPORT ${PROJECT_NAME}
      DESTINATION bin
      )
  ENDIF()
ENDFUNCTION()
|
||||
|
||||
# "make check" convenience target: runs ctest verbosely. Test executables
# added by TRIBITS_ADD_EXECUTABLE_AND_TEST are made dependencies of this
# target so they get built on demand.
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
|
||||
|
||||
# Build a test-only executable and register it with CTest.
#
# The executable is created via TRIBITS_ADD_EXECUTABLE(... TESTONLY); the
# resulting (possibly package-prefixed) target name comes back through its
# ADDED_EXE_TARGET_NAME_OUT into TEST_NAME, which doubles as the test name.
# Honored test properties: PASS/FAIL_REGULAR_EXPRESSION and WILL_FAIL.
# NOTE(review): ENVIRONMENT, TIMEOUT, CATEGORIES and STANDARD_PASS_OUTPUT
# are parsed but not acted on here.
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)

  SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
  SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
  SET(multiValueArgs)

  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Unparsed arguments (e.g. SOURCES ...) pass straight through; TEST_NAME
  # receives the created target's name via PARENT_SCOPE.
  TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})

  # On Windows the test must run from the library output dir and include the
  # .exe suffix explicitly.
  IF(WIN32)
    ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
  ELSE()
    ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
  ENDIF()
  # Ensure "make check" builds this test executable.
  ADD_DEPENDENCIES(check ${TEST_NAME})

  IF(PARSE_FAIL_REGULAR_EXPRESSION)
    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
  ENDIF()

  IF(PARSE_PASS_REGULAR_EXPRESSION)
    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
  ENDIF()

  IF(PARSE_WILL_FAIL)
    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
  ENDIF()

  # Both out-variables report the same name: test name == target name here.
  IF(PARSE_ADDED_TESTS_NAMES_OUT)
    SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
  ENDIF()

  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
  ENDIF()

ENDFUNCTION()
|
||||
|
||||
# Create the interface target TPL_LIB_<name> carrying a TPL's libraries and
# include directories (read from TPL_<name>_LIBRARIES / _INCLUDE_DIRS).
# NOTE(review): "TIBITS" (missing 'R') is a typo, but it is spelled this way
# at every call site in this file; renaming it is an interface change and
# must be done everywhere at once.
MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
  ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
  TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
  TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
ENDMACRO()
|
||||
|
||||
# Locate a TPL from REQUIRED_HEADERS / REQUIRED_LIBS_NAMES and, on success,
# create its TPL_LIB_<name> interface target. Search results are cached in
# TPL_<name>_LIBRARIES and TPL_<name>_INCLUDE_DIRS.
# NOTE(review): MUST_FIND_ALL_LIBS / MUST_FIND_ALL_HEADERS /
# NO_PRINT_ENABLE_SUCCESS_FAIL are accepted for TriBITS compatibility but
# are not acted on in this implementation.
FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)

  SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
  SET(oneValueArgs)
  SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)

  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Assume success until either lookup fails.
  SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
  IF (PARSE_REQUIRED_LIBS_NAMES)
    FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
    IF(NOT TPL_${TPL_NAME}_LIBRARIES)
      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
    ENDIF()
  ENDIF()
  IF (PARSE_REQUIRED_HEADERS)
    FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
    IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
    ENDIF()
  ENDIF()

  # Only materialize the imported target when everything was found.
  IF (_${TPL_NAME}_ENABLE_SUCCESS)
    TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
  ENDIF()

ENDFUNCTION()
|
||||
|
||||
# Evaluate one TPL locator file and record whether its TPL was found.
# The TPL's name is the file's base name; the file is expected to create an
# imported target named TPL_LIB_<name> when the TPL is available, and
# TPL_ENABLE_<name> is set accordingly.
MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
  GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
  INCLUDE("${TPL_FILE}")
  # Did the locator manage to create the imported target?
  IF(NOT TARGET TPL_LIB_${TPL_NAME})
    MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
    SET(TPL_ENABLE_${TPL_NAME} FALSE)
  ELSE()
    MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
    SET(TPL_ENABLE_${TPL_NAME} TRUE)
  ENDIF()
ENDMACRO()
|
||||
|
||||
# Prepend TARGET_NAME onto the global list VARNAME if the target exists.
# A missing target is a FATAL_ERROR when TYPE is "REQUIRED" and is silently
# skipped otherwise.
MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
  IF(TYPE STREQUAL "REQUIRED")
    SET(REQUIRED TRUE)
  ELSE()
    SET(REQUIRED FALSE)
  ENDIF()
  IF(TARGET ${TARGET_NAME})
    PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
  ELSEIF(REQUIRED)
    MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
  ENDIF()
ENDMACRO()
|
||||
|
||||
# Prepend PACKAGE_<dep> interface targets onto the global list DEP_LIST.
# NOTE(review): TYPE (REQUIRED/OPTIONAL) is accepted for call-site symmetry
# with TRIBITS_APPEND_TPLS_DEPS but is not consulted here -- all package
# dependencies are treated alike.
MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
  FOREACH(DEP ${ARGN})
    PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
  ENDFOREACH()
ENDMACRO()
|
||||
|
||||
# Prepend TPL_LIB_<dep> targets onto the global list DEP_LIST; a missing
# target aborts the configure when TYPE is REQUIRED (see PREPEND_TARGET_SET).
MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
  FOREACH(DEP ${ARGN})
    PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
  ENDFOREACH()
ENDMACRO()
|
||||
|
||||
# For every TPL name given, set the global ${PACKAGE_NAME}_ENABLE_<tpl> flag
# to whether that TPL's imported target exists.
MACRO(TRIBITS_ENABLE_TPLS)
  FOREACH(TPL ${ARGN})
    # BUGFIX: test the actual imported-target name TPL_LIB_<tpl>, the name
    # created by TIBITS_CREATE_IMPORTED_TPL_LIBRARY and checked everywhere
    # else in this file. The original tested IF(TARGET ${TPL}) -- a bare
    # target name that is never created -- so every TPL was reported
    # disabled regardless of availability.
    IF(TARGET TPL_LIB_${TPL})
      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
    ELSE()
      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
    ENDIF()
  ENDFOREACH()
ENDMACRO()
|
||||
|
||||
# Record this (sub)package's dependencies, declared TriBITS-style.
#
# Builds two global lists:
#   ${PACKAGE_NAME}_DEPS      - targets needed to build the package's libs
#   ${PACKAGE_NAME}_TEST_DEPS - additional targets needed by its tests
# and sets ${PACKAGE_NAME}_ENABLE_<TPL> for every TPL mentioned.
# NOTE(review): REGRESSION_EMAIL_LIST and
# SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS are parsed for compatibility with
# real TriBITS Dependencies.cmake files but are otherwise unused here.
MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)

  SET(options)
  SET(oneValueArgs)
  SET(multiValueArgs
    LIB_REQUIRED_PACKAGES
    LIB_OPTIONAL_PACKAGES
    TEST_REQUIRED_PACKAGES
    TEST_OPTIONAL_PACKAGES
    LIB_REQUIRED_TPLS
    LIB_OPTIONAL_TPLS
    TEST_REQUIRED_TPLS
    TEST_OPTIONAL_TPLS
    REGRESSION_EMAIL_LIST
    SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
    )
  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  # Library-build dependencies.
  GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})

  # Test-only dependencies.
  GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})

  # Record the package-level enable flag for every referenced TPL.
  TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})

ENDMACRO()
|
||||
|
||||
# Begin a subpackage: derive its full name, create its interface target,
# reset its global LIBS list, and pull in its dependency declarations.
MACRO(TRIBITS_SUBPACKAGE NAME)
  SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
  SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
  # Subpackage full name is parent + NAME concatenated with no separator,
  # e.g. Kokkos + Core -> KokkosCore.
  SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)

  # Interface target that dependents link against to pick up this
  # subpackage's libraries transitively.
  ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})

  # Filled in by TRIBITS_ADD_LIBRARY calls within the subpackage.
  GLOBAL_SET(${PACKAGE_NAME}_LIBS "")

  INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)

ENDMACRO(TRIBITS_SUBPACKAGE)
|
||||
|
||||
# Finish a subpackage: attach all of its collected libraries to the
# subpackage's interface target so dependents get them transitively.
MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
  TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
|
||||
|
||||
# Declare the top-level package: establishes the PROJECT/PACKAGE name
# variables and processes every TPL locator under cmake/deps/*.cmake.
MACRO(TRIBITS_PACKAGE_DECL NAME)

  PROJECT(${NAME})
  STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
  # In this standalone layer the package and the project coincide.
  SET(PACKAGE_NAME ${PROJECT_NAME})
  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)

  # Evaluate each TPL dependency file, creating TPL_LIB_* targets and
  # TPL_ENABLE_* flags as each TPL is (or is not) found.
  SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
  FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
  FOREACH(TPL_FILE ${TPLS_FILES})
    TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
  ENDFOREACH()

ENDMACRO()
|
||||
|
||||
|
||||
# Discover and add all subpackages: any top-level directory containing a
# cmake/Dependencies.cmake file is treated as a subpackage.
MACRO(TRIBITS_PROCESS_SUBPACKAGES)
  FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
  FOREACH(SUBPACKAGE ${SUBPACKAGES})
    # Strip ".../cmake/Dependencies.cmake" back down to the subpackage root.
    GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
    GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
    ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
  ENDFOREACH()
ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
|
||||
|
||||
# No-op stubs: these TriBITS hooks do nothing in this lightweight emulation
# layer; they are defined only so package CMakeLists written against full
# TriBITS still configure cleanly.
MACRO(TRIBITS_PACKAGE_DEF)
ENDMACRO(TRIBITS_PACKAGE_DEF)

MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)

MACRO(TRIBITS_EXCLUDE_FILES)
ENDMACRO(TRIBITS_EXCLUDE_FILES)

MACRO(TRIBITS_PACKAGE_POSTPROCESS)
ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)
|
||||
|
|
@ -0,0 +1,153 @@
|
|||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
The following steps are for workstations/servers with the SEMS environment installed.
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
Summary:
|
||||
|
||||
- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
|
||||
|
||||
- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
|
||||
|
||||
- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
|
||||
|
||||
- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
|
||||
|
||||
- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
Step 1:
|
||||
1.1. Update kokkos develop branch (NOT a fork)
|
||||
|
||||
(From kokkos directory):
|
||||
git fetch --all
|
||||
git checkout develop
|
||||
git reset --hard origin/develop
|
||||
|
||||
1.2. Create a testing directory - here the directory is created within the kokkos directory
|
||||
|
||||
mkdir testing
|
||||
cd testing
|
||||
|
||||
1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
|
||||
|
||||
../config/test_all_sandia
|
||||
|
||||
1.4 Clean repository of untracked files
|
||||
|
||||
cd ../
|
||||
git clean -df
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
Step 2:
|
||||
2.1 Update Trilinos develop branch
|
||||
|
||||
(From Trilinos directory):
|
||||
git checkout develop
|
||||
git fetch --all
|
||||
git reset --hard origin/develop
|
||||
git clean -df
|
||||
|
||||
2.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files
|
||||
|
||||
module load python/2.7.9
|
||||
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
Step 3:
|
||||
3.1. Build and test Trilinos with 3 different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following 3 configurations with appropriate environment variable(s):
|
||||
|
||||
- GCC/4.7.2-OpenMP/Complex
|
||||
Run tests with the following environment variable:
|
||||
|
||||
export OMP_NUM_THREADS=2
|
||||
|
||||
|
||||
- Intel/15.0.2-Serial/NoComplex
|
||||
|
||||
|
||||
- GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
|
||||
Run tests with the following environment variables:
|
||||
|
||||
export CUDA_LAUNCH_BLOCKING=1
|
||||
export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
|
||||
|
||||
|
||||
mkdir Build
|
||||
cd Build
|
||||
cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
|
||||
** Set the path to Trilinos appropriately within the configure-all script **
|
||||
source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
|
||||
source configure-all
|
||||
make -k (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
|
||||
ctest
|
||||
|
||||
3.2. Compare the failed test output to the test output on the dashboard ( testing.sandia.gov/cdash select Trilinos ); investigate and fix problems if new tests fail after the Kokkos snapshot
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
Step 4:
|
||||
4.1. Once all Trilinos tests pass promote Kokkos develop branch to master on Github
|
||||
|
||||
- DO NOT fast-forward the merge!!!!
|
||||
|
||||
(From kokkos directory):
|
||||
git checkout master
|
||||
git fetch --all
|
||||
# Ensure we are on the current origin/master
|
||||
git reset --hard origin/master
|
||||
git merge --no-ff origin/develop
|
||||
|
||||
4.2. Update the tag in kokkos/config/master_history.txt
|
||||
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
|
||||
Tag format: #.#.##
|
||||
|
||||
# Prepend master_history.txt with
|
||||
|
||||
# tag: #.#.##
|
||||
# date: mm/dd/yyyy
|
||||
# master: sha1
|
||||
# develop: sha1
|
||||
# -----------------------
|
||||
|
||||
git commit --amend -a
|
||||
|
||||
git tag -a #.#.##
|
||||
tag: #.#.##
|
||||
date: mm/dd/yyyy
|
||||
master: sha1
|
||||
develop: sha1
|
||||
|
||||
git push --follow-tags origin master
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
||||
|
||||
Step 5:
|
||||
5.1. Make sure Trilinos is up-to-date - chances are other changes have been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot the testing procedure may need to be repeated
|
||||
|
||||
(From Trilinos directory):
|
||||
git checkout develop
|
||||
git fetch --all
|
||||
git reset --hard origin/develop
|
||||
git clean -df
|
||||
|
||||
5.2. Snapshot Kokkos master branch into Trilinos
|
||||
|
||||
(From kokkos directory):
|
||||
git fetch --all
|
||||
git checkout tags/#.#.##
|
||||
git clean -df
|
||||
|
||||
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
|
||||
|
||||
5.3. Push the updated develop branch of Trilinos to Github - congratulations!!!
|
||||
|
||||
(From Trilinos directory):
|
||||
git push
|
||||
|
||||
// -------------------------------------------------------------------------------- //
|
|
@ -0,0 +1,3 @@
|
|||
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
|
||||
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a
|
||||
|
|
@ -1,17 +1,12 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# This shell script (nvcc_wrapper) wraps both the host compiler and
|
||||
# NVCC, if you are building Trilinos with CUDA enabled. The script
|
||||
# remedies some differences between the interface of NVCC and that of
|
||||
# the host compiler, in particular for linking. It also means that
|
||||
# Trilinos doesn't need separate .cu files; it can just use .cpp
|
||||
# files.
|
||||
# NVCC, if you are building legacy C or C++ code with CUDA enabled.
|
||||
# The script remedies some differences between the interface of NVCC
|
||||
# and that of the host compiler, in particular for linking.
|
||||
# It also means that a legacy code doesn't need separate .cu files;
|
||||
# it can just use .cpp files.
|
||||
#
|
||||
# Hopefully, at some point, NVIDIA may fix NVCC so as to make this
|
||||
# script obsolete. For now, this script exists and if you want to
|
||||
# build Trilinos with CUDA enabled, you must use this script as your
|
||||
# compiler.
|
||||
|
||||
# Default settings: change those according to your machine. For
|
||||
# example, you may have two different wrappers with either icpc
|
||||
# or g++ as their back-end compiler. The defaults can be overwritten
|
||||
|
@ -53,6 +48,10 @@ object_files=""
|
|||
# Link objects for the host linker only
|
||||
object_files_xlinker=""
|
||||
|
||||
# Shared libraries with version numbers are not handled correctly by NVCC
|
||||
shared_versioned_libraries_host=""
|
||||
shared_versioned_libraries=""
|
||||
|
||||
# Does the User set the architecture
|
||||
arch_set=0
|
||||
|
||||
|
@ -76,6 +75,9 @@ first_xcompiler_arg=1
|
|||
|
||||
temp_dir=${TMPDIR:-/tmp}
|
||||
|
||||
# Check if we have an optimization argument already
|
||||
optimization_applied=0
|
||||
|
||||
#echo "Arguments: $# $@"
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
|
@ -97,8 +99,17 @@ do
|
|||
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
|
||||
cpp_files="$cpp_files $1"
|
||||
;;
|
||||
# Ensure we only have one optimization flag because NVCC doesn't allow muliple
|
||||
-O*)
|
||||
if [ $optimization_applied -eq 1 ]; then
|
||||
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
|
||||
else
|
||||
shared_args="$shared_args $1"
|
||||
optimization_applied=1
|
||||
fi
|
||||
;;
|
||||
#Handle shared args (valid for both nvcc and the host compiler)
|
||||
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
|
||||
-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
|
||||
shared_args="$shared_args $1"
|
||||
;;
|
||||
#Handle shared args that have an argument
|
||||
|
@ -107,7 +118,7 @@ do
|
|||
shift
|
||||
;;
|
||||
#Handle known nvcc args
|
||||
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
|
||||
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle known nvcc args that have an argument
|
||||
|
@ -175,10 +186,15 @@ do
|
|||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
|
||||
*.so.*|*.dylib)
|
||||
*.dylib)
|
||||
object_files="$object_files -Xlinker $1"
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#Handle shared libraries with *.so.* names which nvcc can't do.
|
||||
*.so.*)
|
||||
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
|
||||
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
|
||||
;;
|
||||
#All other args are sent to the host compiler
|
||||
*)
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
|
@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then
|
|||
fi
|
||||
|
||||
#Compose compilation command
|
||||
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args"
|
||||
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
|
||||
if [ $first_xcompiler_arg -eq 0 ]; then
|
||||
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
|
||||
fi
|
||||
|
||||
#Compose host only command
|
||||
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args"
|
||||
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
|
||||
|
||||
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
|
||||
if [ $replace_pragma_ident -eq 1 ]; then
|
||||
|
|
|
@ -6,34 +6,36 @@
|
|||
|
||||
set -o pipefail
|
||||
|
||||
# Determine current machine
|
||||
|
||||
MACHINE=""
|
||||
HOSTNAME=$(hostname)
|
||||
if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
|
||||
MACHINE=white
|
||||
elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
|
||||
MACHINE=bowman
|
||||
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
|
||||
MACHINE=shepard
|
||||
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
|
||||
MACHINE=sems
|
||||
else
|
||||
echo "Unrecognized machine" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
|
||||
IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
|
||||
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
|
||||
|
||||
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
|
||||
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
|
||||
CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
|
||||
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
|
||||
CUDA_WARNING_FLAGS=""
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
|
||||
declare -i NUM_RESULTS_TO_KEEP=7
|
||||
|
||||
RESULT_ROOT_PREFIX=TestAll
|
||||
|
||||
source /projects/modulefiles/utils/sems-modules-init.sh
|
||||
source /projects/modulefiles/utils/kokkos-modules-init.sh
|
||||
|
||||
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
|
||||
|
||||
#
|
||||
# Handle arguments
|
||||
#
|
||||
|
||||
# Default. Machine specific can override
|
||||
DEBUG=False
|
||||
ARGS=""
|
||||
CUSTOM_BUILD_LIST=""
|
||||
|
@ -41,6 +43,107 @@ DRYRUN=False
|
|||
BUILD_ONLY=False
|
||||
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
|
||||
TEST_SCRIPT=False
|
||||
SKIP_HWLOC=False
|
||||
|
||||
ARCH_FLAG=""
|
||||
|
||||
#
|
||||
# Machine specific config
|
||||
#
|
||||
|
||||
if [ "$MACHINE" = "sems" ]; then
|
||||
source /projects/modulefiles/utils/sems-modules-init.sh
|
||||
source /projects/modulefiles/utils/kokkos-modules-init.sh
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
elif [ "$MACHINE" = "white" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
|
||||
|
||||
# Don't do pthread on white
|
||||
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
ARCH_FLAG="--arch=Power8"
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=8
|
||||
|
||||
elif [ "$MACHINE" = "bowman" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
|
||||
|
||||
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
ARCH_FLAG="--arch=KNL"
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=8
|
||||
|
||||
elif [ "$MACHINE" = "shepard" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
export SLURM_TASKS_PER_NODE=32
|
||||
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
|
||||
|
||||
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
ARCH_FLAG="--arch=HSW"
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=8
|
||||
|
||||
else
|
||||
echo "Unhandled machine $MACHINE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
|
||||
declare -i NUM_RESULTS_TO_KEEP=7
|
||||
|
||||
RESULT_ROOT_PREFIX=TestAll
|
||||
|
||||
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
|
||||
|
||||
#
|
||||
# Handle arguments
|
||||
#
|
||||
|
||||
while [[ $# > 0 ]]
|
||||
do
|
||||
|
@ -61,6 +164,9 @@ BUILD_ONLY=True
|
|||
--test-script*)
|
||||
TEST_SCRIPT=True
|
||||
;;
|
||||
--skip-hwloc*)
|
||||
SKIP_HWLOC=True
|
||||
;;
|
||||
--num*)
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
|
||||
;;
|
||||
|
@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
|
|||
echo " Defaults to root repo containing this script"
|
||||
echo "--debug: Run tests in debug. Defaults to False"
|
||||
echo "--test-script: Test this script, not Kokkos"
|
||||
echo "--skip-hwloc: Do not do hwloc tests"
|
||||
echo "--num=N: Number of jobs to run in parallel "
|
||||
echo "--dry-run: Just print what would be executed"
|
||||
echo "--build-only: Just do builds, don't run anything"
|
||||
|
@ -82,21 +189,16 @@ echo " Valid items:"
|
|||
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
|
||||
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
|
||||
echo ""
|
||||
|
||||
echo "ARGS: list of expressions matching compilers to test"
|
||||
echo " supported compilers"
|
||||
echo " gcc/4.7.2"
|
||||
echo " gcc/4.8.4"
|
||||
echo " gcc/4.9.2"
|
||||
echo " gcc/5.1.0"
|
||||
echo " intel/14.0.4"
|
||||
echo " intel/15.0.2"
|
||||
echo " intel/16.0.1"
|
||||
echo " clang/3.5.2"
|
||||
echo " clang/3.6.1"
|
||||
echo " cuda/6.5.14"
|
||||
echo " cuda/7.0.28"
|
||||
echo " cuda/7.5.18"
|
||||
echo " supported compilers sems"
|
||||
for COMPILER_DATA in "${COMPILERS[@]}"; do
|
||||
ARR=($COMPILER_DATA)
|
||||
COMPILER=${ARR[0]}
|
||||
echo " $COMPILER"
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "Examples:"
|
||||
echo " Run all tests"
|
||||
echo " % test_all_sandia"
|
||||
|
@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then
|
|||
ARGS='?'
|
||||
fi
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
# Process args to figure out which compilers to test
|
||||
COMPILERS_TO_TEST=""
|
||||
for ARG in $ARGS; do
|
||||
|
@ -240,18 +327,19 @@ run_cmd() {
|
|||
fi
|
||||
}
|
||||
|
||||
# report_and_log_test_results <SUCCESS> <DESC> <PHASE>
|
||||
# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
|
||||
report_and_log_test_result() {
|
||||
# Use sane var names
|
||||
local success=$1; local desc=$2; local phase=$3;
|
||||
local success=$1; local desc=$2; local comment=$3;
|
||||
|
||||
if [ "$success" = "0" ]; then
|
||||
echo " PASSED $desc"
|
||||
touch $PASSED_DIR/$desc
|
||||
echo $comment > $PASSED_DIR/$desc
|
||||
else
|
||||
# For failures, comment should be the name of the phase that failed
|
||||
echo " FAILED $desc" >&2
|
||||
echo $phase > $FAILED_DIR/$desc
|
||||
cat ${desc}.${phase}.log
|
||||
echo $comment > $FAILED_DIR/$desc
|
||||
cat ${desc}.${comment}.log
|
||||
fi
|
||||
}
|
||||
|
||||
|
@ -309,6 +397,8 @@ single_build_and_test() {
|
|||
|
||||
echo " Starting job $desc"
|
||||
|
||||
local comment="no_comment"
|
||||
|
||||
if [ "$TEST_SCRIPT" = "True" ]; then
|
||||
local rand=$[ 1 + $[ RANDOM % 10 ]]
|
||||
sleep $rand
|
||||
|
@ -316,14 +406,19 @@ single_build_and_test() {
|
|||
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
|
||||
fi
|
||||
else
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
local -i build_start_time=$(date +%s)
|
||||
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
local -i build_end_time=$(date +%s)
|
||||
comment="build_time=$(($build_end_time-$build_start_time))"
|
||||
if [[ "$BUILD_ONLY" == False ]]; then
|
||||
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
|
||||
local -i run_end_time=$(date +%s)
|
||||
comment="$comment run_time=$(($run_end_time-$build_end_time))"
|
||||
fi
|
||||
fi
|
||||
|
||||
report_and_log_test_result 0 $desc
|
||||
report_and_log_test_result 0 $desc "$comment"
|
||||
|
||||
return 0
|
||||
}
|
||||
|
@ -374,7 +469,7 @@ build_and_test_all() {
|
|||
run_in_background $compiler $build $BUILD_TYPE
|
||||
|
||||
# If not cuda, do a hwloc test too
|
||||
if [[ "$compiler" != cuda* ]]; then
|
||||
if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
|
||||
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
|
||||
fi
|
||||
done
|
||||
|
@ -401,7 +496,11 @@ wait_summarize_and_exit() {
|
|||
echo "PASSED TESTS"
|
||||
echo "#######################################################"
|
||||
|
||||
\ls -1 $PASSED_DIR | sort
|
||||
local passed_test
|
||||
for passed_test in $(\ls -1 $PASSED_DIR | sort)
|
||||
do
|
||||
echo $passed_test $(cat $PASSED_DIR/$passed_test)
|
||||
done
|
||||
|
||||
echo "#######################################################"
|
||||
echo "FAILED TESTS"
|
||||
|
@ -409,7 +508,7 @@ wait_summarize_and_exit() {
|
|||
|
||||
local failed_test
|
||||
local -i rv=0
|
||||
for failed_test in $(\ls -1 $FAILED_DIR)
|
||||
for failed_test in $(\ls -1 $FAILED_DIR | sort)
|
||||
do
|
||||
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
|
||||
rv=$rv+1
|
||||
|
|
|
@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP)
|
|||
LIST( APPEND SOURCES TestOpenMP.cpp)
|
||||
ENDIF()
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest
|
||||
# Per #374, we always want to build this test, but we only want to run
|
||||
# it as a PERFORMANCE test. That's why we separate building the test
|
||||
# from running the test.
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
PerfTestExec
|
||||
SOURCES ${SOURCES}
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
|
||||
TRIBITS_ADD_TEST(
|
||||
PerformanceTest
|
||||
NAME PerfTestExec
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
CATEGORIES PERFORMANCE
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
)
|
||||
|
|
|
@ -54,6 +54,8 @@
|
|||
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
#include <TestDynRankView.hpp>
|
||||
|
||||
#include <Kokkos_UnorderedMap.hpp>
|
||||
|
||||
#include <TestGlobal2LocalIds.hpp>
|
||||
|
@ -77,6 +79,13 @@ protected:
|
|||
}
|
||||
};
|
||||
|
||||
TEST_F( cuda, dynrankview_perf )
|
||||
{
|
||||
std::cout << "Cuda" << std::endl;
|
||||
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
|
||||
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
|
||||
}
|
||||
|
||||
TEST_F( cuda, global_2_local)
|
||||
{
|
||||
std::cout << "Cuda" << std::endl;
|
||||
|
|
|
@ -0,0 +1,265 @@
|
|||
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
|
||||
#define KOKKOS_TEST_DYNRANKVIEW_HPP
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
#include <vector>
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
// Compare performance of DynRankView to View, specific focus on the parenthesis operators
|
||||
|
||||
namespace Performance {
|
||||
|
||||
//View functor
|
||||
template <typename DeviceType>
|
||||
struct InitViewFunctor {
|
||||
typedef Kokkos::View<double***, DeviceType> inviewtype;
|
||||
inviewtype _inview;
|
||||
|
||||
InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int i) const {
|
||||
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
|
||||
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
|
||||
_inview(i,j,k) = i/2 -j*j + k/3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SumComputationTest
|
||||
{
|
||||
typedef Kokkos::View<double***, DeviceType> inviewtype;
|
||||
inviewtype _inview;
|
||||
|
||||
typedef Kokkos::View<double*, DeviceType> outviewtype;
|
||||
outviewtype _outview;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int i) const {
|
||||
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
|
||||
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
|
||||
_outview(i) += _inview(i,j,k) ;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
template <typename DeviceType>
|
||||
struct InitStrideViewFunctor {
|
||||
typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
|
||||
inviewtype _inview;
|
||||
|
||||
InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int i) const {
|
||||
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
|
||||
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
|
||||
_inview(i,j,k) = i/2 -j*j + k/3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template <typename DeviceType>
|
||||
struct InitViewRank7Functor {
|
||||
typedef Kokkos::View<double*******, DeviceType> inviewtype;
|
||||
inviewtype _inview;
|
||||
|
||||
InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int i) const {
|
||||
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
|
||||
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
|
||||
_inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
//DynRankView functor
|
||||
template <typename DeviceType>
|
||||
struct InitDynRankViewFunctor {
|
||||
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
|
||||
inviewtype _inview;
|
||||
|
||||
InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int i) const {
|
||||
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
|
||||
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
|
||||
_inview(i,j,k) = i/2 -j*j + k/3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SumComputationTest
|
||||
{
|
||||
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
|
||||
inviewtype _inview;
|
||||
|
||||
typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
|
||||
outviewtype _outview;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const int i) const {
|
||||
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
|
||||
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
|
||||
_outview(i) += _inview(i,j,k) ;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
|
||||
template <typename DeviceType>
|
||||
void test_dynrankview_op_perf( const int par_size )
|
||||
{
|
||||
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
const size_type dim2 = 900;
|
||||
const size_type dim3 = 300;
|
||||
|
||||
double elapsed_time_view = 0;
|
||||
double elapsed_time_compview = 0;
|
||||
double elapsed_time_strideview = 0;
|
||||
double elapsed_time_view_rank7 = 0;
|
||||
double elapsed_time_drview = 0;
|
||||
double elapsed_time_compdrview = 0;
|
||||
Kokkos::Timer timer;
|
||||
{
|
||||
Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
|
||||
typedef InitViewFunctor<DeviceType> FunctorType;
|
||||
|
||||
timer.reset();
|
||||
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
|
||||
Kokkos::parallel_for( policy , FunctorType(testview) );
|
||||
DeviceType::fence();
|
||||
elapsed_time_view = timer.seconds();
|
||||
std::cout << " View time (init only): " << elapsed_time_view << std::endl;
|
||||
|
||||
|
||||
timer.reset();
|
||||
Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
|
||||
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
|
||||
DeviceType::fence();
|
||||
elapsed_time_compview = timer.seconds();
|
||||
std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
|
||||
|
||||
|
||||
Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
|
||||
typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
|
||||
|
||||
timer.reset();
|
||||
Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
|
||||
DeviceType::fence();
|
||||
elapsed_time_strideview = timer.seconds();
|
||||
std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
|
||||
}
|
||||
{
|
||||
Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
|
||||
typedef InitViewRank7Functor<DeviceType> FunctorType;
|
||||
|
||||
timer.reset();
|
||||
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
|
||||
Kokkos::parallel_for( policy , FunctorType(testview) );
|
||||
DeviceType::fence();
|
||||
elapsed_time_view_rank7 = timer.seconds();
|
||||
std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
|
||||
}
|
||||
{
|
||||
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
|
||||
typedef InitDynRankViewFunctor<DeviceType> FunctorType;
|
||||
|
||||
timer.reset();
|
||||
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
|
||||
Kokkos::parallel_for( policy , FunctorType(testdrview) );
|
||||
DeviceType::fence();
|
||||
elapsed_time_drview = timer.seconds();
|
||||
std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
|
||||
|
||||
timer.reset();
|
||||
Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
|
||||
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
|
||||
DeviceType::fence();
|
||||
elapsed_time_compdrview = timer.seconds();
|
||||
std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
|
||||
|
||||
}
|
||||
|
||||
std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
|
||||
std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
|
||||
std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
|
||||
std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
|
||||
std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
|
||||
|
||||
timer.reset();
|
||||
|
||||
} //end test_dynrankview
|
||||
|
||||
|
||||
} //end Performance
|
||||
#endif
|
|
@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids)
|
|||
std::cout << num_ids << ", ";
|
||||
|
||||
double elasped_time = 0;
|
||||
Kokkos::Impl::Timer timer;
|
||||
Kokkos::Timer timer;
|
||||
|
||||
local_id_view local_2_global("local_ids", num_ids);
|
||||
global_id_view global_2_local((3u*num_ids)/2u);
|
||||
|
|
|
@ -50,6 +50,8 @@
|
|||
#include <TestGlobal2LocalIds.hpp>
|
||||
#include <TestUnorderedMapPerformance.hpp>
|
||||
|
||||
#include <TestDynRankView.hpp>
|
||||
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
@ -91,6 +93,13 @@ protected:
|
|||
}
|
||||
};
|
||||
|
||||
TEST_F( openmp, dynrankview_perf )
|
||||
{
|
||||
std::cout << "OpenMP" << std::endl;
|
||||
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
|
||||
test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
|
||||
}
|
||||
|
||||
TEST_F( openmp, global_2_local)
|
||||
{
|
||||
std::cout << "OpenMP" << std::endl;
|
||||
|
|
|
@ -52,6 +52,8 @@
|
|||
#include <TestGlobal2LocalIds.hpp>
|
||||
#include <TestUnorderedMapPerformance.hpp>
|
||||
|
||||
#include <TestDynRankView.hpp>
|
||||
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
@ -85,6 +87,13 @@ protected:
|
|||
}
|
||||
};
|
||||
|
||||
TEST_F( threads, dynrankview_perf )
|
||||
{
|
||||
std::cout << "Threads" << std::endl;
|
||||
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
|
||||
test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
|
||||
}
|
||||
|
||||
TEST_F( threads, global_2_local)
|
||||
{
|
||||
std::cout << "Threads" << std::endl;
|
||||
|
|
|
@ -80,7 +80,7 @@ struct UnorderedMapTest
|
|||
, map(capacity)
|
||||
, histogram(map.get_histogram())
|
||||
{
|
||||
Kokkos::Impl::Timer wall_clock ;
|
||||
Kokkos::Timer wall_clock ;
|
||||
wall_clock.reset();
|
||||
|
||||
value_type v = {};
|
||||
|
@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name)
|
|||
distance_out << "\b\b\b " << std::endl;
|
||||
block_distance_out << "\b\b\b " << std::endl;
|
||||
|
||||
Kokkos::Impl::Timer wall_clock ;
|
||||
Kokkos::Timer wall_clock ;
|
||||
for (int i=0; i < num_collisions ; ++i) {
|
||||
wall_clock.reset();
|
||||
std::cout << "Collisions: " << collisions[i] << std::endl;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -77,10 +77,7 @@ private:
|
|||
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::MemoryPool
|
||||
< typename traits::memory_space
|
||||
, typename traits::execution_space
|
||||
> memory_pool ;
|
||||
typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
|
||||
|
||||
private:
|
||||
|
||||
|
@ -338,7 +335,7 @@ public:
|
|||
void operator()( unsigned i ) const
|
||||
{
|
||||
if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
|
||||
m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() );
|
||||
m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
|
||||
}
|
||||
m_chunks[i] = 0 ;
|
||||
}
|
||||
|
@ -397,7 +394,7 @@ public:
|
|||
// The memory pool chunk is guaranteed to be a power of two
|
||||
, m_chunk_shift(
|
||||
Kokkos::Impl::integral_power_of_two(
|
||||
m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) )
|
||||
m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
|
||||
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
|
||||
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
|
||||
{
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -45,6 +45,7 @@
|
|||
#define KOKKOS_BITSET_IMPL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <impl/Kokkos_BitOps.hpp>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <cstdio>
|
||||
|
@ -52,122 +53,57 @@
|
|||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
unsigned rotate_right(unsigned i, int r)
|
||||
unsigned rotate_right( unsigned i, int r )
|
||||
{
|
||||
enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
|
||||
return r ? ((i >> r) | (i << (size-r))) : i ;
|
||||
enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
|
||||
return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int bit_scan_forward(unsigned i)
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return __ffs(i) - 1;
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_ffs(i) - 1;
|
||||
#elif defined( __INTEL_COMPILER )
|
||||
return _bit_scan_forward(i);
|
||||
#else
|
||||
|
||||
unsigned t = 1u;
|
||||
int r = 0;
|
||||
while (i && (i & t == 0))
|
||||
{
|
||||
t = t << 1;
|
||||
++r;
|
||||
}
|
||||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int bit_scan_reverse(unsigned i)
|
||||
{
|
||||
enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return shift - __clz(i);
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return shift - __builtin_clz(i);
|
||||
#elif defined( __INTEL_COMPILER )
|
||||
return _bit_scan_reverse(i);
|
||||
#else
|
||||
unsigned t = 1u << shift;
|
||||
int r = 0;
|
||||
while (i && (i & t == 0))
|
||||
{
|
||||
t = t >> 1;
|
||||
++r;
|
||||
}
|
||||
return r;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// count the bits set
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int popcount(unsigned i)
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return __popc(i);
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_popcount(i);
|
||||
#elif defined ( __INTEL_COMPILER )
|
||||
return _popcnt32(i);
|
||||
#else
|
||||
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
|
||||
i = i - ((i >> 1) & ~0u/3u); // temp
|
||||
i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp
|
||||
i = (i + (i >> 4)) & ~0u/255u*15u; // temp
|
||||
return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename Bitset>
|
||||
template < typename Bitset >
|
||||
struct BitsetCount
|
||||
{
|
||||
typedef Bitset bitset_type;
|
||||
typedef typename bitset_type::execution_space::execution_space execution_space;
|
||||
typedef typename bitset_type::size_type size_type;
|
||||
typedef size_type value_type;
|
||||
typedef Bitset bitset_type;
|
||||
typedef typename bitset_type::execution_space::execution_space execution_space;
|
||||
typedef typename bitset_type::size_type size_type;
|
||||
typedef size_type value_type;
|
||||
|
||||
bitset_type m_bitset;
|
||||
|
||||
BitsetCount( bitset_type const& bitset)
|
||||
BitsetCount( bitset_type const& bitset )
|
||||
: m_bitset(bitset)
|
||||
{}
|
||||
|
||||
size_type apply() const
|
||||
{
|
||||
size_type count = 0u;
|
||||
parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
|
||||
parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
|
||||
return count;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void init( value_type & count)
|
||||
void init( value_type & count ) const
|
||||
{
|
||||
count = 0u;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void join( volatile value_type & count, const volatile size_type & incr )
|
||||
void join( volatile value_type & count, const volatile size_type & incr ) const
|
||||
{
|
||||
count += incr;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( size_type i, value_type & count) const
|
||||
void operator()( size_type i, value_type & count ) const
|
||||
{
|
||||
count += popcount(m_bitset.m_blocks[i]);
|
||||
count += bit_count( m_bitset.m_blocks[i] );
|
||||
}
|
||||
};
|
||||
|
||||
}} //Kokkos::Impl
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_BITSET_IMPL_HPP
|
||||
|
||||
|
|
|
@ -713,13 +713,20 @@ public:
|
|||
typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
|
||||
|
||||
typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
|
||||
typedef typename dView0::host_mirror_space host ;
|
||||
typedef typename dView0::host_mirror_space host_drv_space ;
|
||||
|
||||
typedef Kokkos::Experimental::View< T , device > View0 ;
|
||||
typedef Kokkos::Experimental::View< T* , device > View1 ;
|
||||
typedef Kokkos::Experimental::View< T******* , device > View7 ;
|
||||
|
||||
typedef typename View0::host_mirror_space host_view_space ;
|
||||
|
||||
TestDynViewAPI()
|
||||
{
|
||||
run_test_resize_realloc();
|
||||
run_test_mirror();
|
||||
run_test();
|
||||
run_test_scalar();
|
||||
run_test();
|
||||
run_test_const();
|
||||
run_test_subview();
|
||||
run_test_subview_strided();
|
||||
|
@ -735,19 +742,147 @@ public:
|
|||
TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
|
||||
}
|
||||
|
||||
static void run_test_resize_realloc()
|
||||
{
|
||||
dView0 drv0("drv0", 10, 20, 30);
|
||||
ASSERT_EQ( drv0.rank(), 3);
|
||||
|
||||
Kokkos::Experimental::resize(drv0, 5, 10);
|
||||
ASSERT_EQ( drv0.rank(), 2);
|
||||
ASSERT_EQ( drv0.dimension_0(), 5);
|
||||
ASSERT_EQ( drv0.dimension_1(), 10);
|
||||
ASSERT_EQ( drv0.dimension_2(), 1);
|
||||
|
||||
Kokkos::Experimental::realloc(drv0, 10, 20);
|
||||
ASSERT_EQ( drv0.rank(), 2);
|
||||
ASSERT_EQ( drv0.dimension_0(), 10);
|
||||
ASSERT_EQ( drv0.dimension_1(), 20);
|
||||
ASSERT_EQ( drv0.dimension_2(), 1);
|
||||
|
||||
}
|
||||
|
||||
static void run_test_mirror()
|
||||
{
|
||||
typedef Kokkos::Experimental::DynRankView< int , host > view_type ;
|
||||
typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
|
||||
typedef typename view_type::HostMirror mirror_type ;
|
||||
view_type a("a");
|
||||
mirror_type am = Kokkos::Experimental::create_mirror_view(a);
|
||||
mirror_type ax = Kokkos::Experimental::create_mirror(a);
|
||||
ASSERT_EQ( & a() , & am() );
|
||||
ASSERT_EQ( a.rank() , am.rank() );
|
||||
ASSERT_EQ( ax.rank() , am.rank() );
|
||||
|
||||
if (Kokkos::HostSpace::execution_space::is_initialized() )
|
||||
{
|
||||
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
|
||||
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
|
||||
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
|
||||
|
||||
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
|
||||
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
|
||||
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
|
||||
|
||||
ASSERT_EQ(equal_ptr_h_h2,0);
|
||||
ASSERT_EQ(equal_ptr_h_d ,0);
|
||||
ASSERT_EQ(equal_ptr_h2_d,0);
|
||||
|
||||
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
|
||||
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
|
||||
|
||||
ASSERT_EQ(a_h.rank(),a_h2.rank());
|
||||
ASSERT_EQ(a_h.rank(),a_d.rank());
|
||||
}
|
||||
if (Kokkos::HostSpace::execution_space::is_initialized() )
|
||||
{
|
||||
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
|
||||
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
|
||||
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
|
||||
|
||||
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
|
||||
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
|
||||
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
|
||||
|
||||
ASSERT_EQ(equal_ptr_h_h2,0);
|
||||
ASSERT_EQ(equal_ptr_h_d ,0);
|
||||
ASSERT_EQ(equal_ptr_h2_d,0);
|
||||
|
||||
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
|
||||
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
|
||||
|
||||
ASSERT_EQ(a_h.rank(),a_h2.rank());
|
||||
ASSERT_EQ(a_h.rank(),a_d.rank());
|
||||
}
|
||||
|
||||
if (Kokkos::HostSpace::execution_space::is_initialized() )
|
||||
{
|
||||
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
|
||||
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
|
||||
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
|
||||
|
||||
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
|
||||
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
|
||||
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
|
||||
|
||||
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
|
||||
ASSERT_EQ(equal_ptr_h_h2,1);
|
||||
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
|
||||
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
|
||||
|
||||
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
|
||||
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
|
||||
|
||||
ASSERT_EQ(a_h.rank(),a_h2.rank());
|
||||
ASSERT_EQ(a_h.rank(),a_d.rank());
|
||||
}
|
||||
if (Kokkos::HostSpace::execution_space::is_initialized() )
|
||||
{
|
||||
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
|
||||
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
|
||||
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
|
||||
|
||||
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
|
||||
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
|
||||
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
|
||||
|
||||
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
|
||||
ASSERT_EQ(equal_ptr_h_h2,1);
|
||||
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
|
||||
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
|
||||
|
||||
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
|
||||
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
|
||||
|
||||
ASSERT_EQ(a_h.rank(),a_h2.rank());
|
||||
ASSERT_EQ(a_h.rank(),a_d.rank());
|
||||
}
|
||||
if (Kokkos::HostSpace::execution_space::is_initialized() )
|
||||
{
|
||||
typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
|
||||
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
|
||||
view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
|
||||
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
|
||||
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
|
||||
|
||||
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
|
||||
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
|
||||
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
|
||||
|
||||
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
|
||||
ASSERT_EQ(equal_ptr_h_h2,1);
|
||||
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
|
||||
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
|
||||
|
||||
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
|
||||
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
|
||||
|
||||
ASSERT_EQ(a_h.rank(),a_h2.rank());
|
||||
ASSERT_EQ(a_h.rank(),a_d.rank());
|
||||
}
|
||||
}
|
||||
|
||||
static void run_test_scalar()
|
||||
{
|
||||
typedef typename dView0::HostMirror hView0 ;
|
||||
typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView
|
||||
|
||||
dView0 dx , dy ;
|
||||
hView0 hx , hy ;
|
||||
|
@ -765,6 +900,79 @@ public:
|
|||
Kokkos::Experimental::deep_copy( hy , dy );
|
||||
|
||||
ASSERT_EQ( hx(), hy() );
|
||||
ASSERT_EQ( dx.rank() , hx.rank() );
|
||||
ASSERT_EQ( dy.rank() , hy.rank() );
|
||||
|
||||
//View - DynRankView Interoperability tests
|
||||
// deep_copy DynRankView to View
|
||||
View0 vx("vx");
|
||||
Kokkos::deep_copy( vx , dx );
|
||||
ASSERT_EQ( rank(dx) , rank(vx) );
|
||||
|
||||
View0 vy("vy");
|
||||
Kokkos::deep_copy( vy , dy );
|
||||
ASSERT_EQ( rank(dy) , rank(vy) );
|
||||
|
||||
// deep_copy View to DynRankView
|
||||
dView0 dxx("dxx");
|
||||
Kokkos::deep_copy( dxx , vx );
|
||||
ASSERT_EQ( rank(dxx) , rank(vx) );
|
||||
|
||||
|
||||
View7 vcast = dx.ConstDownCast();
|
||||
ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
|
||||
ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
|
||||
ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
|
||||
ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
|
||||
ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
|
||||
|
||||
View7 vcast1( dy.ConstDownCast() );
|
||||
ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
|
||||
ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
|
||||
ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
|
||||
ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
|
||||
ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
|
||||
|
||||
//View - DynRankView Interoperability tests
|
||||
// copy View to DynRankView
|
||||
dView0 dfromvx( vx );
|
||||
auto hmx = Kokkos::create_mirror_view(dfromvx) ;
|
||||
Kokkos::deep_copy(hmx , dfromvx);
|
||||
auto hvx = Kokkos::create_mirror_view(vx) ;
|
||||
Kokkos::deep_copy(hvx , vx);
|
||||
ASSERT_EQ( rank(hvx) , rank(hmx) );
|
||||
ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
|
||||
ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
|
||||
|
||||
// copy-assign View to DynRankView
|
||||
dView0 dfromvy = vy ;
|
||||
auto hmy = Kokkos::create_mirror_view(dfromvy) ;
|
||||
Kokkos::deep_copy(hmy , dfromvy);
|
||||
auto hvy = Kokkos::create_mirror_view(vy) ;
|
||||
Kokkos::deep_copy(hvy , vy);
|
||||
ASSERT_EQ( rank(hvy) , rank(hmy) );
|
||||
ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
|
||||
ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
|
||||
|
||||
|
||||
View7 vtest1("vtest1",2,2,2,2,2,2,2);
|
||||
dView0 dfromv1( vtest1 );
|
||||
ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
|
||||
ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
|
||||
ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
|
||||
ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
|
||||
|
||||
dView0 dfromv2( vcast );
|
||||
ASSERT_EQ( dfromv2.rank() , vcast.Rank );
|
||||
ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
|
||||
ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
|
||||
ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
|
||||
|
||||
dView0 dfromv3 = vcast1;
|
||||
ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
|
||||
ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
|
||||
ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
|
||||
ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
|
||||
}
|
||||
|
||||
static void run_test()
|
||||
|
@ -782,22 +990,32 @@ public:
|
|||
(void) thing;
|
||||
}
|
||||
|
||||
dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
|
||||
ASSERT_TRUE( d_uninitialized.data() != nullptr );
|
||||
ASSERT_EQ( d_uninitialized.rank() , 2 );
|
||||
ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
|
||||
ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
|
||||
ASSERT_EQ( d_uninitialized.dimension_2() , 1 );
|
||||
|
||||
dView0 dx , dy , dz ;
|
||||
hView0 hx , hy , hz ;
|
||||
|
||||
ASSERT_TRUE( dx.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( dy.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( dz.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
|
||||
ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
|
||||
|
||||
ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
|
||||
ASSERT_TRUE( dy.ptr_on_device() == 0 ); //Okay with UVM
|
||||
ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
|
||||
ASSERT_TRUE( hx.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( hy.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( hz.ptr_on_device() == 0 );
|
||||
ASSERT_EQ( dx.dimension_0() , 0u );
|
||||
ASSERT_EQ( dy.dimension_0() , 0u );
|
||||
ASSERT_EQ( dz.dimension_0() , 0u );
|
||||
ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
|
||||
ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
|
||||
ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
|
||||
ASSERT_EQ( hx.dimension_0() , 0u );
|
||||
ASSERT_EQ( hy.dimension_0() , 0u );
|
||||
ASSERT_EQ( hz.dimension_0() , 0u );
|
||||
ASSERT_EQ( dx.rank() , 0u );
|
||||
ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
|
||||
ASSERT_EQ( hx.rank() , 0u );
|
||||
|
||||
dx = dView0( "dx" , N1 , N2 , N3 );
|
||||
|
@ -806,11 +1024,11 @@ public:
|
|||
hx = hView0( "hx" , N1 , N2 , N3 );
|
||||
hy = hView0( "hy" , N1 , N2 , N3 );
|
||||
|
||||
ASSERT_EQ( dx.dimension_0() , unsigned(N1) );
|
||||
ASSERT_EQ( dy.dimension_0() , unsigned(N1) );
|
||||
ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
|
||||
ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
|
||||
ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
|
||||
ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
|
||||
ASSERT_EQ( dx.rank() , 3 );
|
||||
ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
|
||||
ASSERT_EQ( hx.rank() , 3 );
|
||||
|
||||
dx = dView0( "dx" , N0 , N1 , N2 , N3 );
|
||||
|
@ -823,19 +1041,23 @@ public:
|
|||
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
|
||||
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
|
||||
ASSERT_EQ( dx.rank() , 4 );
|
||||
ASSERT_EQ( dy.rank() , 4 );
|
||||
ASSERT_EQ( hx.rank() , 4 );
|
||||
ASSERT_EQ( hy.rank() , 4 );
|
||||
|
||||
ASSERT_EQ( dx.use_count() , size_t(1) );
|
||||
|
||||
dView0_unmanaged unmanaged_dx = dx;
|
||||
ASSERT_EQ( dx.use_count() , size_t(1) );
|
||||
|
||||
|
||||
dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
|
||||
dx.dimension_0(),
|
||||
dx.dimension_1(),
|
||||
dx.dimension_2(),
|
||||
dx.dimension_3());
|
||||
|
||||
|
||||
{
|
||||
// Destruction of this view should be harmless
|
||||
const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
|
||||
|
@ -888,6 +1110,19 @@ public:
|
|||
hx = Kokkos::Experimental::create_mirror( dx );
|
||||
hy = Kokkos::Experimental::create_mirror( dy );
|
||||
|
||||
ASSERT_EQ( hx.rank() , dx.rank() );
|
||||
ASSERT_EQ( hy.rank() , dy.rank() );
|
||||
|
||||
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
|
||||
ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
|
||||
ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
|
||||
ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
|
||||
|
||||
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
|
||||
ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
|
||||
ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
|
||||
ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
|
||||
|
||||
// T v1 = hx() ; // Generates compile error as intended
|
||||
// T v2 = hx(0,0) ; // Generates compile error as intended
|
||||
// hx(0,0) = v2 ; // Generates compile error as intended
|
||||
|
@ -990,7 +1225,9 @@ public:
|
|||
for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
|
||||
{ ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
|
||||
}}}}
|
||||
// ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
|
||||
}
|
||||
|
||||
dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
|
||||
dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
|
||||
|
||||
|
@ -1006,6 +1243,35 @@ public:
|
|||
ASSERT_TRUE( dx.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( dy.ptr_on_device() == 0 );
|
||||
ASSERT_TRUE( dz.ptr_on_device() == 0 );
|
||||
|
||||
//View - DynRankView Interoperability tests
|
||||
// deep_copy from view to dynrankview
|
||||
const int testdim = 4;
|
||||
dView0 dxx("dxx",testdim);
|
||||
View1 vxx("vxx",testdim);
|
||||
auto hvxx = Kokkos::create_mirror_view(vxx);
|
||||
for (int i = 0; i < testdim; ++i)
|
||||
{ hvxx(i) = i; }
|
||||
Kokkos::deep_copy(vxx,hvxx);
|
||||
Kokkos::deep_copy(dxx,vxx);
|
||||
auto hdxx = Kokkos::create_mirror_view(dxx);
|
||||
Kokkos::deep_copy(hdxx,dxx);
|
||||
for (int i = 0; i < testdim; ++i)
|
||||
{ ASSERT_EQ( hvxx(i) , hdxx(i) ); }
|
||||
|
||||
ASSERT_EQ( rank(hdxx) , rank(hvxx) );
|
||||
ASSERT_EQ( hdxx.dimension_0() , testdim );
|
||||
ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
|
||||
|
||||
// deep_copy from dynrankview to view
|
||||
View1 vdxx("vdxx",testdim);
|
||||
auto hvdxx = Kokkos::create_mirror_view(vdxx);
|
||||
Kokkos::deep_copy(hvdxx , hdxx);
|
||||
ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
|
||||
ASSERT_EQ( hvdxx.dimension_0() , testdim );
|
||||
ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
|
||||
for (int i = 0; i < testdim; ++i)
|
||||
{ ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
|
||||
}
|
||||
|
||||
typedef T DataType ;
|
||||
|
@ -1059,35 +1325,66 @@ public:
|
|||
// N0 = 1000,N1 = 3,N2 = 5,N3 = 7
|
||||
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
|
||||
sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
|
||||
ASSERT_EQ( d7.rank() , 7 );
|
||||
|
||||
sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview
|
||||
sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 );
|
||||
ASSERT_EQ( ds0.rank() , 0 );
|
||||
|
||||
//Basic test - ALL
|
||||
sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs
|
||||
sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() );
|
||||
ASSERT_EQ( dsALL.rank() , 7 );
|
||||
|
||||
// Send a single value for one rank
|
||||
// Send a value to final rank returning rank 6 subview
|
||||
sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
|
||||
ASSERT_EQ( dsm1.rank() , 6 );
|
||||
|
||||
// Send a std::pair as a rank
|
||||
// Send a std::pair as argument to a rank
|
||||
sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
|
||||
ASSERT_EQ( dssp.rank() , 7 );
|
||||
|
||||
// Send a kokkos::pair as a rank; take default layout as input
|
||||
// Send a kokkos::pair as argument to a rank; take default layout as input
|
||||
dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
|
||||
ASSERT_EQ( dd0.rank() , 7 );
|
||||
sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
|
||||
ASSERT_EQ( dtkp.rank() , 7 );
|
||||
|
||||
// Return rank 7 subview, taking a pair as one argument, layout stride input
|
||||
sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
|
||||
ASSERT_EQ( ds7.rank() , 7 );
|
||||
|
||||
// Default Layout DynRankView
|
||||
dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
|
||||
ASSERT_EQ( dv6.rank() , 6 );
|
||||
|
||||
// DynRankView with LayoutRight
|
||||
typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
|
||||
drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
|
||||
ASSERT_EQ( dr5.rank() , 5 );
|
||||
|
||||
// LayoutStride but arranged as LayoutRight
|
||||
unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 };
|
||||
sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) );
|
||||
// NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that
|
||||
// rank deduction can properly take place
|
||||
unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
|
||||
Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
|
||||
ls.dimension[5] = ~size_t(0);
|
||||
ls.dimension[6] = ~size_t(0);
|
||||
ls.dimension[7] = ~size_t(0);
|
||||
sdView d5("d5", ls);
|
||||
ASSERT_EQ( d5.rank() , 5 );
|
||||
|
||||
// LayoutStride arranged as LayoutRight - commented out as example that fails unit test
|
||||
// unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
|
||||
// sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
|
||||
//
|
||||
// Fails the following unit test:
|
||||
// ASSERT_EQ( d5.rank() , dr5.rank() );
|
||||
//
|
||||
// Explanation: In construction of the Kokkos::LayoutStride below, since the
|
||||
// remaining dimensions are not specified, they will default to values of 0
|
||||
// rather than ~size_t(0).
|
||||
// When passed to the DynRankView constructor the default dimensions (of 0)
|
||||
// will be counted toward the dynamic rank and returning an incorrect value
|
||||
// (i.e. rank 7 rather than 5).
|
||||
|
||||
// Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should)
|
||||
ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
|
||||
|
@ -1100,21 +1397,21 @@ public:
|
|||
|
||||
// Rank 5 subview of rank 5 dynamic rank view, layout stride input
|
||||
sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
|
||||
ASSERT_EQ( ds5.rank() , 5 );
|
||||
|
||||
// Pass in extra ALL arguments beyond the rank of the DynRank View.
|
||||
// This behavior is allowed - ignore the extra ALL arguments when
|
||||
// the src.rank() < number of arguments, but be careful!
|
||||
sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
|
||||
|
||||
ASSERT_EQ( ds5.rank() , ds5plus.rank() );
|
||||
ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
|
||||
ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
|
||||
ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
|
||||
ASSERT_EQ( ds5.rank() , ds5plus.rank() );
|
||||
ASSERT_EQ( ds5.rank() , 5 );
|
||||
|
||||
#if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
|
||||
ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 );
|
||||
ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
|
||||
ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 ); // passing argument to rank beyond the view's rank is allowed iff it is a 0.
|
||||
#endif
|
||||
|
||||
// Similar test to rank 5 above, but create rank 4 subview
|
||||
|
@ -1131,9 +1428,9 @@ public:
|
|||
|
||||
static void run_test_subview_strided()
|
||||
{
|
||||
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ;
|
||||
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ;
|
||||
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ;
|
||||
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
|
||||
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
|
||||
typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;
|
||||
|
||||
drview_left xl2( "xl2", 100 , 200 );
|
||||
drview_right xr2( "xr2", 100 , 200 );
|
||||
|
@ -1159,35 +1456,37 @@ public:
|
|||
drview_left xl4( "xl4", 10 , 20 , 30 , 40 );
|
||||
drview_right xr4( "xr4", 10 , 20 , 30 , 40 );
|
||||
|
||||
drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
|
||||
drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
|
||||
//Replace subdynrankview with subview - test
|
||||
drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
|
||||
drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
|
||||
|
||||
ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
|
||||
ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
|
||||
ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
|
||||
ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
|
||||
ASSERT_EQ( yl4.rank() , 2);
|
||||
ASSERT_EQ( yr4.rank() , 2);
|
||||
|
||||
ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
|
||||
ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
|
||||
|
||||
}
|
||||
|
||||
static void run_test_vector()
|
||||
{
|
||||
static const unsigned Length = 1000 , Count = 8 ;
|
||||
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ;
|
||||
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;
|
||||
|
||||
multivector_type mv = multivector_type( "mv" , Length , Count );
|
||||
multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
|
||||
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update
|
||||
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
|
||||
typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;
|
||||
|
||||
svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
|
||||
svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
|
||||
|
@ -1251,7 +1550,6 @@ public:
|
|||
const_smultivector_type cmv( mv );
|
||||
typename smultivector_type::const_type cmvX( cmv );
|
||||
typename const_smultivector_type::const_type ccmvX( cmv );
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -61,8 +61,7 @@ struct TestDynamicView
|
|||
typedef typename Space::execution_space execution_space ;
|
||||
typedef typename Space::memory_space memory_space ;
|
||||
|
||||
typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space >
|
||||
memory_pool_type ;
|
||||
typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
|
||||
|
||||
typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
|
||||
|
||||
|
@ -129,11 +128,9 @@ struct TestDynamicView
|
|||
typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
|
||||
typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;
|
||||
|
||||
const unsigned int chunk_size = 1024 ;
|
||||
|
||||
// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
|
||||
|
||||
memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) );
|
||||
memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );
|
||||
|
||||
// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#cmakedefine KOKKOS_HAVE_Winthread
|
||||
#cmakedefine KOKKOS_HAVE_OPENMP
|
||||
#cmakedefine KOKKOS_HAVE_HWLOC
|
||||
#cmakedefine KOKKOS_HAVE_DEBUG
|
||||
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
#cmakedefine KOKKOS_HAVE_CXX11
|
||||
#cmakedefine KOKKOS_HAVE_CUSPARSE
|
||||
|
|
|
@ -8,11 +8,22 @@ SET(SOURCES
|
|||
PerfTestCuda.cpp
|
||||
)
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerfTest
|
||||
# Per #374, we always want to build this test, but we only want to run
|
||||
# it as a PERFORMANCE test. That's why we separate building the test
|
||||
# from running the test.
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE(
|
||||
PerfTestExec
|
||||
SOURCES ${SOURCES}
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerfTest
|
||||
NAME PerfTestExec
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
CATEGORIES PERFORMANCE
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
)
|
||||
|
|
|
@ -159,7 +159,7 @@ struct TextureFetch
|
|||
|
||||
Kokkos::Cuda::fence();
|
||||
|
||||
Kokkos::Impl::Timer timer;
|
||||
Kokkos::Timer timer;
|
||||
for (int j=0; j<10; ++j) {
|
||||
RandomReduce f(array,indexes);
|
||||
f.apply(reduce);
|
||||
|
|
|
@ -153,7 +153,7 @@ struct ModifiedGramSchmidt
|
|||
|
||||
Kokkos::deep_copy( one , (Scalar) 1 );
|
||||
|
||||
Kokkos::Impl::Timer timer ;
|
||||
Kokkos::Timer timer ;
|
||||
|
||||
for ( size_type j = 0 ; j < count ; ++j ) {
|
||||
// Reduction : tmp = dot( Q(:,j) , Q(:,j) );
|
||||
|
|
|
@ -252,7 +252,7 @@ struct HexGrad
|
|||
execution_space::fence();
|
||||
|
||||
for ( int i = 0 ; i < iter ; ++i ) {
|
||||
Kokkos::Impl::Timer timer ;
|
||||
Kokkos::Timer timer ;
|
||||
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
|
||||
execution_space::fence();
|
||||
const double dt = timer.seconds();
|
||||
|
|
|
@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) {
|
|||
|
||||
Kokkos::Impl::Timer timer;
|
||||
T res = LoopVariant<T>(loop,test);
|
||||
double time1 = timer.seconds();
|
||||
double time = timer.seconds();
|
||||
|
||||
timer.reset();
|
||||
T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
|
||||
double time2 = timer.seconds();
|
||||
double timeNonAtomic = timer.seconds();
|
||||
|
||||
timer.reset();
|
||||
T resSerial = LoopVariantSerial<T>(loop,test);
|
||||
double time3 = timer.seconds();
|
||||
double timeSerial = timer.seconds();
|
||||
|
||||
time1*=1e6/loop;
|
||||
time2*=1e6/loop;
|
||||
time3*=1e6/loop;
|
||||
time *=1e6/loop;
|
||||
timeNonAtomic*=1e6/loop;
|
||||
timeSerial *=1e6/loop;
|
||||
//textcolor_standard();
|
||||
bool passed = true;
|
||||
if(resSerial!=res) passed = false;
|
||||
//if(!passed) textcolor(RESET,BLACK,YELLOW);
|
||||
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
|
||||
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
|
||||
type_name,test,passed?"PASSED":"FAILED",loop,
|
||||
1.0*resSerial,1.0*res,1.0*resNonAtomic,
|
||||
timeSerial,time,timeNonAtomic,(int)sizeof(T));
|
||||
//if(!passed) textcolor_standard();
|
||||
printf("\n");
|
||||
}
|
||||
|
@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) {
|
|||
int main(int argc, char* argv[])
|
||||
{
|
||||
int type = -1;
|
||||
int loop = 1000000;
|
||||
int loop = 100000;
|
||||
int test = -1;
|
||||
|
||||
for(int i=0;i<argc;i++)
|
||||
|
|
|
@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
|
|||
|
||||
#endif
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
struct CudaLockArraysStruct {
|
||||
int* atomic;
|
||||
int* scratch;
|
||||
int* threadid;
|
||||
};
|
||||
}
|
||||
}
|
||||
__device__ __constant__
|
||||
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
int* kokkos_impl_cuda_atomic_lock_array ;
|
||||
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
|
||||
}
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
__device__ inline
|
||||
|
@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) {
|
|||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
|
||||
return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
|
||||
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
|
@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) {
|
|||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
|
||||
atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
|
||||
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > {
|
|||
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
|
||||
|
||||
#ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
|
||||
int* lock_array_ptr = lock_array_cuda_space_ptr();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
|
||||
// Invoke the driver function on the device
|
||||
|
@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > {
|
|||
#endif
|
||||
|
||||
#ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
|
||||
int* lock_array_ptr = lock_array_cuda_space_ptr();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
|
||||
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
|
||||
|
|
|
@ -51,10 +51,10 @@
|
|||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#include <Kokkos_CudaSpace.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
|
@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
|
|||
|
||||
namespace Kokkos {
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
namespace {
|
||||
|
||||
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
|
||||
, unsigned type_size
|
||||
, ::cudaChannelFormatDesc const & desc
|
||||
)
|
||||
{
|
||||
enum { TEXTURE_BOUND_1D = 2u << 27 };
|
||||
|
||||
if ( tracker.attribute() == NULL ) {
|
||||
// check for correct allocator
|
||||
const bool ok_alloc = tracker.allocator()->support_texture_binding();
|
||||
|
||||
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
|
||||
|
||||
if (ok_alloc && ok_count) {
|
||||
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
|
||||
tracker.set_attribute( attr );
|
||||
}
|
||||
else {
|
||||
std::ostringstream oss;
|
||||
oss << "Error: Cannot attach texture object";
|
||||
if (!ok_alloc) {
|
||||
oss << ", incompatabile allocator " << tracker.allocator()->name();
|
||||
}
|
||||
if (!ok_count) {
|
||||
oss << ", array " << tracker.label() << " too large";
|
||||
}
|
||||
oss << ".";
|
||||
Kokkos::Impl::throw_runtime_exception( oss.str() );
|
||||
}
|
||||
}
|
||||
|
||||
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
|
||||
std::ostringstream oss;
|
||||
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
|
||||
Kokkos::Impl::throw_runtime_exception( oss.str() );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
|
||||
{
|
||||
return Impl::AllocationTracker( allocator(), size, label);
|
||||
}
|
||||
|
||||
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
|
||||
, unsigned type_size
|
||||
, ::cudaChannelFormatDesc const & desc
|
||||
)
|
||||
{
|
||||
texture_object_attach_impl( tracker, type_size, desc );
|
||||
}
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
void CudaSpace::access_error()
|
||||
{
|
||||
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
|
||||
|
@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const )
|
|||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
|
||||
{
|
||||
return Impl::AllocationTracker( allocator(), size, label);
|
||||
}
|
||||
|
||||
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
|
||||
, unsigned type_size
|
||||
, ::cudaChannelFormatDesc const & desc
|
||||
)
|
||||
{
|
||||
texture_object_attach_impl( tracker, type_size, desc );
|
||||
}
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
bool CudaUVMSpace::available()
|
||||
{
|
||||
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
|
||||
|
@ -212,15 +133,6 @@ bool CudaUVMSpace::available()
|
|||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
|
||||
{
|
||||
return Impl::AllocationTracker( allocator(), size, label);
|
||||
}
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
|
|||
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
__global__ void init_lock_array_kernel() {
|
||||
__global__ void init_lock_array_kernel_atomic() {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if(i<CUDA_SPACE_ATOMIC_MASK+1)
|
||||
kokkos_impl_cuda_atomic_lock_array[i] = 0;
|
||||
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
|
||||
}
|
||||
|
||||
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if(i<N) {
|
||||
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
|
||||
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace Impl {
|
||||
int* lock_array_cuda_space_ptr(bool deallocate) {
|
||||
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
|
@ -845,13 +767,60 @@ int* lock_array_cuda_space_ptr(bool deallocate) {
|
|||
return ptr;
|
||||
}
|
||||
|
||||
void init_lock_array_cuda_space() {
|
||||
int is_initialized = 0;
|
||||
if(! is_initialized) {
|
||||
int* lock_array_ptr = lock_array_cuda_space_ptr();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
|
||||
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
|
||||
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void init_lock_arrays_cuda_space() {
|
||||
static int is_initialized = 0;
|
||||
if(! is_initialized) {
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
|
||||
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
|
||||
}
|
||||
}
|
||||
|
||||
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
|
||||
static void* ptr = NULL;
|
||||
static size_t current_size = 0;
|
||||
if(current_size == 0) {
|
||||
current_size = bytes;
|
||||
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
|
||||
}
|
||||
if(bytes > current_size) {
|
||||
current_size = bytes;
|
||||
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
|
||||
}
|
||||
if((bytes < current_size) && (force_shrink)) {
|
||||
current_size = bytes;
|
||||
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
|
||||
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -50,7 +50,6 @@
|
|||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
|
|
@ -1,198 +0,0 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
TextureAttribute::TextureAttribute( void * const alloc_ptr
|
||||
, size_t alloc_size
|
||||
, cudaChannelFormatDesc const & desc
|
||||
)
|
||||
: m_tex_obj(0)
|
||||
{
|
||||
cuda_device_synchronize();
|
||||
|
||||
struct cudaResourceDesc resDesc ;
|
||||
struct cudaTextureDesc texDesc ;
|
||||
|
||||
memset( & resDesc , 0 , sizeof(resDesc) );
|
||||
memset( & texDesc , 0 , sizeof(texDesc) );
|
||||
|
||||
resDesc.resType = cudaResourceTypeLinear ;
|
||||
resDesc.res.linear.desc = desc ;
|
||||
resDesc.res.linear.sizeInBytes = alloc_size ;
|
||||
resDesc.res.linear.devPtr = alloc_ptr ;
|
||||
|
||||
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
|
||||
|
||||
cuda_device_synchronize();
|
||||
}
|
||||
|
||||
|
||||
TextureAttribute::~TextureAttribute()
|
||||
{
|
||||
if (m_tex_obj) {
|
||||
cudaDestroyTextureObject( m_tex_obj );
|
||||
}
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
void * CudaMallocAllocator::allocate( size_t size )
|
||||
{
|
||||
void * ptr = NULL;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
try {
|
||||
CUDA_SAFE_CALL( cudaFree( ptr ) );
|
||||
} catch(...) {}
|
||||
}
|
||||
|
||||
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
|
||||
{
|
||||
void * ptr = old_ptr;
|
||||
if (old_size != new_size) {
|
||||
ptr = allocate( new_size );
|
||||
size_t copy_size = old_size < new_size ? old_size : new_size;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
|
||||
|
||||
deallocate( old_ptr, old_size );
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
void * CudaUVMAllocator::allocate( size_t size )
|
||||
{
|
||||
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
|
||||
void * ptr = NULL;
|
||||
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
|
||||
return ptr;
|
||||
#else
|
||||
throw_runtime_exception( "CUDA VERSION does not support UVM" );
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
try {
|
||||
CUDA_SAFE_CALL( cudaFree( ptr ) );
|
||||
} catch(...) {}
|
||||
}
|
||||
|
||||
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
|
||||
{
|
||||
void * ptr = old_ptr;
|
||||
if (old_size != new_size) {
|
||||
ptr = allocate( new_size );
|
||||
size_t copy_size = old_size < new_size ? old_size : new_size;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
|
||||
|
||||
deallocate( old_ptr, old_size );
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
void * CudaHostAllocator::allocate( size_t size )
|
||||
{
|
||||
void * ptr = NULL;
|
||||
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
try {
|
||||
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
|
||||
} catch(...) {}
|
||||
}
|
||||
|
||||
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
|
||||
{
|
||||
void * ptr = old_ptr;
|
||||
if (old_size != new_size) {
|
||||
ptr = allocate( new_size );
|
||||
size_t copy_size = old_size < new_size ? old_size : new_size;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
|
||||
|
||||
deallocate( old_ptr, old_size );
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif //KOKKOS_HAVE_CUDA
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
|
@ -1,190 +0,0 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
|
||||
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
|
||||
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
|
||||
// to be an 'unsigned long long'. This chould change with
|
||||
// future version of Cuda and this typedef would have to
|
||||
// change accordingly.
|
||||
|
||||
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
|
||||
|
||||
typedef enable_if<
|
||||
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
|
||||
::cudaTextureObject_t >::type cuda_texture_object_type ;
|
||||
|
||||
#else
|
||||
|
||||
typedef const void * cuda_texture_object_type ;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
struct TextureAttribute : public AllocatorAttributeBase
|
||||
{
|
||||
cuda_texture_object_type m_tex_obj ;
|
||||
|
||||
TextureAttribute( void * const alloc_ptr
|
||||
, size_t alloc_size
|
||||
, cudaChannelFormatDesc const & desc
|
||||
);
|
||||
|
||||
~TextureAttribute();
|
||||
};
|
||||
|
||||
/// class CudaUnmanagedAllocator
|
||||
/// does nothing when deallocate(ptr,size) is called
|
||||
struct CudaUnmanagedAllocator
|
||||
{
|
||||
static const char * name()
|
||||
{
|
||||
return "Cuda Unmanaged Allocator";
|
||||
}
|
||||
|
||||
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
|
||||
|
||||
static bool support_texture_binding() { return true; }
|
||||
};
|
||||
|
||||
/// class CudaUnmanagedAllocator
|
||||
/// does nothing when deallocate(ptr,size) is called
|
||||
struct CudaUnmanagedUVMAllocator
|
||||
{
|
||||
static const char * name()
|
||||
{
|
||||
return "Cuda Unmanaged UVM Allocator";
|
||||
}
|
||||
|
||||
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
|
||||
|
||||
static bool support_texture_binding() { return true; }
|
||||
};
|
||||
|
||||
/// class CudaUnmanagedHostAllocator
|
||||
/// does nothing when deallocate(ptr,size) is called
|
||||
class CudaUnmanagedHostAllocator
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Cuda Unmanaged Host Allocator";
|
||||
}
|
||||
// Unmanaged deallocate does nothing
|
||||
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
|
||||
};
|
||||
|
||||
/// class CudaMallocAllocator
|
||||
class CudaMallocAllocator
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Cuda Malloc Allocator";
|
||||
}
|
||||
|
||||
static void* allocate(size_t size);
|
||||
|
||||
static void deallocate(void * ptr, size_t);
|
||||
|
||||
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
|
||||
|
||||
static bool support_texture_binding() { return true; }
|
||||
};
|
||||
|
||||
/// class CudaUVMAllocator
|
||||
class CudaUVMAllocator
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Cuda UVM Allocator";
|
||||
}
|
||||
|
||||
static void* allocate(size_t size);
|
||||
|
||||
static void deallocate(void * ptr, size_t);
|
||||
|
||||
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
|
||||
|
||||
static bool support_texture_binding() { return true; }
|
||||
};
|
||||
|
||||
/// class CudaHostAllocator
|
||||
class CudaHostAllocator
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Cuda Host Allocator";
|
||||
}
|
||||
|
||||
static void* allocate(size_t size);
|
||||
|
||||
static void deallocate(void * ptr, size_t);
|
||||
|
||||
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
|
||||
};
|
||||
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif //KOKKOS_HAVE_CUDA
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
|
|
@ -51,8 +51,8 @@
|
|||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Standard 'C' libraries */
|
||||
|
@ -70,7 +70,7 @@ __device__ __constant__
|
|||
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
|
||||
|
||||
__device__ __constant__
|
||||
int* kokkos_impl_cuda_atomic_lock_array ;
|
||||
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -190,7 +190,7 @@ namespace {
|
|||
|
||||
class CudaInternalDevices {
|
||||
public:
|
||||
enum { MAXIMUM_DEVICE_COUNT = 8 };
|
||||
enum { MAXIMUM_DEVICE_COUNT = 64 };
|
||||
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
|
||||
int m_cudaDevCount ;
|
||||
|
||||
|
@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices()
|
|||
|
||||
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
|
||||
|
||||
if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
|
||||
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
|
||||
}
|
||||
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
|
||||
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
|
||||
}
|
||||
|
@ -226,14 +229,6 @@ private:
|
|||
CudaInternal( const CudaInternal & );
|
||||
CudaInternal & operator = ( const CudaInternal & );
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
AllocationTracker m_scratchFlagsTracker;
|
||||
AllocationTracker m_scratchSpaceTracker;
|
||||
AllocationTracker m_scratchUnifiedTracker;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
@ -255,6 +250,8 @@ public:
|
|||
size_type * m_scratchUnified ;
|
||||
cudaStream_t * m_stream ;
|
||||
|
||||
static int was_initialized;
|
||||
static int was_finalized;
|
||||
|
||||
static CudaInternal & singleton();
|
||||
|
||||
|
@ -293,6 +290,8 @@ public:
|
|||
size_type * scratch_unified( const size_type size );
|
||||
};
|
||||
|
||||
int CudaInternal::was_initialized = 0;
|
||||
int CudaInternal::was_finalized = 0;
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton()
|
|||
|
||||
void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
{
|
||||
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
|
||||
was_initialized = 1;
|
||||
if ( is_initialized() ) return;
|
||||
|
||||
enum { WordSize = sizeof(size_type) };
|
||||
|
||||
if ( ! HostSpace::execution_space::is_initialized() ) {
|
||||
|
@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
|||
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_cuda_space();
|
||||
Impl::init_lock_arrays_cuda_space();
|
||||
|
||||
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
|
||||
int* lock_array_ptr = lock_array_cuda_space_ptr();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
|
|||
|
||||
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
|
||||
|
||||
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
|
@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
|
|||
|
||||
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
|
||||
}
|
||||
|
||||
|
@ -582,26 +577,15 @@ CudaInternal::scratch_space( const Cuda::size_type size )
|
|||
|
||||
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
, "InternalScratchSpace"
|
||||
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
|
||||
|
||||
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
, "InternalScratchSpace"
|
||||
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
#endif
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
|
||||
}
|
||||
|
||||
return m_scratchSpace ;
|
||||
|
@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
|
|||
|
||||
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
|
||||
|
||||
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
|
||||
|
@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
|
|||
Record::increment( r );
|
||||
|
||||
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return m_scratchUnified ;
|
||||
|
@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
|
|||
|
||||
void CudaInternal::finalize()
|
||||
{
|
||||
was_finalized = 1;
|
||||
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
|
||||
|
||||
lock_array_cuda_space_ptr(true);
|
||||
atomic_lock_array_cuda_space_ptr(false);
|
||||
scratch_lock_array_cuda_space_ptr(false);
|
||||
threadid_lock_array_cuda_space_ptr(false);
|
||||
|
||||
if ( m_stream ) {
|
||||
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
|
||||
cudaStreamDestroy( m_stream[i] );
|
||||
|
@ -655,14 +632,6 @@ void CudaInternal::finalize()
|
|||
::free( m_stream );
|
||||
}
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
m_scratchSpaceTracker.clear();
|
||||
m_scratchFlagsTracker.clear();
|
||||
m_scratchUnifiedTracker.clear();
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
|
||||
|
||||
|
@ -670,8 +639,6 @@ void CudaInternal::finalize()
|
|||
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
|
||||
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
|
||||
|
||||
#endif
|
||||
|
||||
m_cudaDev = -1 ;
|
||||
m_multiProcCount = 0 ;
|
||||
m_maxWarpCount = 0 ;
|
||||
|
@ -730,7 +697,13 @@ int Cuda::is_initialized()
|
|||
{ return Impl::CudaInternal::singleton().is_initialized(); }
|
||||
|
||||
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
|
||||
{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
|
||||
{
|
||||
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
std::vector<unsigned>
|
||||
Cuda::detect_device_arch()
|
||||
|
@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch()
|
|||
}
|
||||
|
||||
void Cuda::finalize()
|
||||
{ Impl::CudaInternal::singleton().finalize(); }
|
||||
{
|
||||
Impl::CudaInternal::singleton().finalize();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
Cuda::Cuda()
|
||||
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
|
||||
|
|
|
@ -57,17 +57,20 @@ template<class DriverType, bool Large>
|
|||
struct CudaGetMaxBlockSize;
|
||||
|
||||
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
|
||||
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra);
|
||||
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
|
||||
}
|
||||
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int numBlocks;
|
||||
int blockSize=32;
|
||||
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
|
@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> {
|
|||
|
||||
while (blockSize<1024 && numBlocks>0) {
|
||||
blockSize*=2;
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length);
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
|
@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> {
|
|||
|
||||
template<class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int numBlocks;
|
||||
|
||||
int blockSize=32;
|
||||
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
|
@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> {
|
|||
|
||||
while (blockSize<1024 && numBlocks>0) {
|
||||
blockSize*=2;
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
|
@ -123,13 +130,15 @@ template<class DriverType, bool Large>
|
|||
struct CudaGetOptBlockSize;
|
||||
|
||||
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
|
||||
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra);
|
||||
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
|
||||
}
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
|
@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
|
|||
blockSize*=2;
|
||||
|
||||
//calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
|
@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
|
|||
|
||||
template<class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
|
||||
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
|
@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> {
|
|||
|
||||
while(blockSize<1024) {
|
||||
blockSize*=2;
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
|
||||
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value,
|
|||
cuda_inter_warp_reduction(value,join,max_active_thread);
|
||||
}
|
||||
|
||||
template< class FunctorType , class JoinOp>
|
||||
template< class FunctorType , class JoinOp , class ArgTag = void >
|
||||
__device__
|
||||
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value,
|
||||
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
|
||||
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
|
||||
const JoinOp& join,
|
||||
Cuda::size_type * const m_scratch_space,
|
||||
typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
|
||||
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
|
||||
Cuda::size_type * const m_scratch_flags,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
|
||||
typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
|
||||
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
|
||||
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
|
||||
|
||||
//Do the intra-block reduction with shfl operations and static shared memory
|
||||
cuda_intra_block_reduction(value,join,max_active_thread);
|
||||
|
@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
|
|||
if(id == 0)
|
||||
*m_scratch_flags = 0;
|
||||
last_block = true;
|
||||
value = 0;
|
||||
value = neutral;
|
||||
|
||||
pointer_type const volatile global = (pointer_type) m_scratch_space ;
|
||||
|
||||
|
@ -366,7 +367,12 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
|||
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
|
||||
size_type * const global = global_data + word_count.value * block_id ;
|
||||
|
||||
#if (__CUDA_ARCH__ < 500)
|
||||
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
|
||||
#else
|
||||
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::Cuda > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
__device__
|
||||
void TaskQueueSpecialization< Kokkos::Cuda >::driver
|
||||
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
|
||||
{
|
||||
using Member = TaskExec< Kokkos::Cuda > ;
|
||||
using Queue = TaskQueue< Kokkos::Cuda > ;
|
||||
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec( 1 );
|
||||
Member team_exec( blockDim.y );
|
||||
|
||||
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
|
||||
|
||||
union {
|
||||
task_root_type * ptr ;
|
||||
int raw[2] ;
|
||||
} task ;
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
do {
|
||||
|
||||
// Each team lead attempts to acquire either a thread team task
|
||||
// or collection of single thread tasks for the team.
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
|
||||
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
|
||||
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
, uintptr_t(task.ptr));
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// shuffle broadcast
|
||||
|
||||
task.raw[0] = __shfl( task.raw[0] , 0 );
|
||||
task.raw[1] = __shfl( task.raw[1] , 0 );
|
||||
|
||||
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
|
||||
|
||||
if ( end != task.ptr ) {
|
||||
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
|
||||
// Thread Team Task
|
||||
(*task.ptr->m_apply)( task.ptr , & team_exec );
|
||||
}
|
||||
else if ( 0 == threadIdx.y ) {
|
||||
// Single Thread Task
|
||||
(*task.ptr->m_apply)( task.ptr , & single_exec );
|
||||
}
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
queue->complete( task.ptr );
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
__global__
|
||||
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
|
||||
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::Cuda >::execute
|
||||
( TaskQueue< Kokkos::Cuda > * const queue )
|
||||
{
|
||||
const int warps_per_block = 4 ;
|
||||
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
|
||||
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
|
||||
const int shared = 0 ;
|
||||
const cudaStream_t stream = 0 ;
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
#if 0
|
||||
printf("cuda_task_queue_execute before\n");
|
||||
#endif
|
||||
|
||||
// Query the stack size, in bytes:
|
||||
//
|
||||
// size_t stack_size = 0 ;
|
||||
// CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
|
||||
//
|
||||
// If not large enough then set the stack size, in bytes:
|
||||
//
|
||||
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
|
||||
|
||||
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
#if 0
|
||||
printf("cuda_task_queue_execute after\n");
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
|
@ -0,0 +1,519 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
#define KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
template< typename TaskType >
|
||||
__global__
|
||||
void set_cuda_task_base_apply_function_pointer
|
||||
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
|
||||
{ *ptr = TaskType::apply ; }
|
||||
|
||||
}
|
||||
|
||||
template<>
|
||||
class TaskQueueSpecialization< Kokkos::Cuda >
|
||||
{
|
||||
public:
|
||||
|
||||
using execution_space = Kokkos::Cuda ;
|
||||
using memory_space = Kokkos::CudaUVMSpace ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const ) {}
|
||||
|
||||
__device__
|
||||
static void driver( queue_type * const );
|
||||
|
||||
static
|
||||
void execute( queue_type * const );
|
||||
|
||||
template< typename FunctorType >
|
||||
static
|
||||
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< execution_space
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType > ;
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::Cuda > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
|
||||
* passed to tasks running in a Cuda space.
|
||||
*
|
||||
* Cuda thread blocks for tasking are dimensioned:
|
||||
* blockDim.x == vector length
|
||||
* blockDim.y == team size
|
||||
* blockDim.z == number of teams
|
||||
* where
|
||||
* blockDim.x * blockDim.y == WarpSize
|
||||
*
|
||||
* Both single thread and thread team tasks are run by a full Cuda warp.
|
||||
* A single thread task is called by warp lane #0 and the remaining
|
||||
* lanes of the warp are idle.
|
||||
*/
|
||||
template<>
|
||||
class TaskExec< Kokkos::Cuda >
|
||||
{
|
||||
private:
|
||||
|
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
|
||||
|
||||
const int m_team_size ;
|
||||
|
||||
__device__
|
||||
TaskExec( int arg_team_size = blockDim.y )
|
||||
: m_team_size( arg_team_size ) {}
|
||||
|
||||
public:
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
__device__ void team_barrier() { /* __threadfence_block(); */ }
|
||||
__device__ int team_rank() const { return threadIdx.y ; }
|
||||
__device__ int team_size() const { return m_team_size ; }
|
||||
#else
|
||||
__host__ void team_barrier() {}
|
||||
__host__ int team_rank() const { return 0 ; }
|
||||
__host__ int team_size() const { return 0 ; }
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
|
||||
{
|
||||
typedef iType index_type;
|
||||
const iType start ;
|
||||
const iType end ;
|
||||
const iType increment ;
|
||||
const TaskExec< Kokkos::Cuda > & thread;
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
|
||||
__device__ inline
|
||||
TeamThreadRangeBoundariesStruct
|
||||
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
|
||||
: start( threadIdx.y )
|
||||
, end(arg_count)
|
||||
, increment( blockDim.y )
|
||||
, thread(arg_thread)
|
||||
{}
|
||||
|
||||
__device__ inline
|
||||
TeamThreadRangeBoundariesStruct
|
||||
( const TaskExec< Kokkos::Cuda > & arg_thread
|
||||
, const iType & arg_start
|
||||
, const iType & arg_end
|
||||
)
|
||||
: start( arg_start + threadIdx.y )
|
||||
, end( arg_end)
|
||||
, increment( blockDim.y )
|
||||
, thread( arg_thread )
|
||||
{}
|
||||
|
||||
#else
|
||||
|
||||
TeamThreadRangeBoundariesStruct
|
||||
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
|
||||
|
||||
TeamThreadRangeBoundariesStruct
|
||||
( const TaskExec< Kokkos::Cuda > & arg_thread
|
||||
, const iType & arg_start
|
||||
, const iType & arg_end
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
|
||||
{
|
||||
typedef iType index_type;
|
||||
const iType start ;
|
||||
const iType end ;
|
||||
const iType increment ;
|
||||
const TaskExec< Kokkos::Cuda > & thread;
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
|
||||
__device__ inline
|
||||
ThreadVectorRangeBoundariesStruct
|
||||
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
|
||||
: start( threadIdx.x )
|
||||
, end(arg_count)
|
||||
, increment( blockDim.x )
|
||||
, thread(arg_thread)
|
||||
{}
|
||||
|
||||
#else
|
||||
|
||||
ThreadVectorRangeBoundariesStruct
|
||||
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
|
||||
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
|
||||
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
|
||||
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
|
||||
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.
|
||||
*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
)
|
||||
{
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce across corresponding lanes between team members within a warp.
// Assumes stride*team_size == warp_size, so the shuffle width spans the
// whole warp while only lanes sharing the same (lane % stride) position
// ever combine values.
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_warp_reduction
  (const JoinType& join,
   ValueType& val,
   int team_size,
   int stride)
{
  // Tree reduction: halve the lane distance each step, stopping once it
  // would drop below 'stride' (which would mix different vector positions).
  for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
    join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
  }
}
|
||||
|
||||
// Multiple within-warp non-strided reductions: each contiguous group of
// 'vec_length' lanes reduces its values independently of the other groups.
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_warp_reduction
  (const JoinType& join,
   ValueType& val,
   int vec_length)
{
  // Standard shuffle-down tree reduction restricted to vec_length-lane groups.
  for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
  }
}
|
||||
|
||||
// broadcast within warp
|
||||
template< class ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType shfl_warp_broadcast
|
||||
(ValueType& val,
|
||||
int src_lane,
|
||||
int width)
|
||||
{
|
||||
return Kokkos::shfl(val, src_lane, width);
|
||||
}
|
||||
|
||||
// All-reduce across corresponding vector lanes between team members within
// a warp, using a caller-supplied join operation.
// Assumed mapping of the task team onto the warp:
//   vec_length*team_size == warp_size
//   blockDim.x == vec_length == stride
//   blockDim.y == team_size
//   threadIdx.x == position in vec
//   threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   const JoinType& join,
   ValueType& initialized_result) {

  // Each thread accumulates its share of the iteration range into a local
  // copy seeded with the caller-provided initial value.
  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  // Combine per-member partials across the warp (stride = vector length),
  // then broadcast so every lane observes the same reduced value.
  strided_shfl_warp_reduction<ValueType, JoinType>(
                          join,
                          initialized_result,
                          loop_boundaries.thread.team_size(),
                          blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
|
||||
|
||||
// All-reduce across corresponding vector lanes between team members within
// a warp; no join() provided, so summation is used.
// Assumed mapping of the task team onto the warp:
//   vec_length*team_size == warp_size
//   blockDim.x == vec_length == stride
//   blockDim.y == team_size
//   threadIdx.x == position in vec
//   threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   ValueType& initialized_result) {

  // Local accumulator mirrors the JoinType overload above; it keeps the
  // per-thread partial separate from the shared in/out argument until the
  // serial portion of the reduction is complete.
  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  // Sum the per-member partials across the warp, then broadcast so every
  // lane observes the same reduced value.
  strided_shfl_warp_reduction(
                          [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
                          initialized_result,
                          loop_boundaries.thread.team_size(),
                          blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
|
||||
|
||||
// All-reduce across the vector lanes of a single team member within a warp,
// using a caller-supplied join operation.
// Assumed mapping of the task team onto the warp:
//   vec_length*team_size == warp_size
//   blockDim.x == vec_length == stride
//   blockDim.y == team_size
//   threadIdx.x == position in vec
//   threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   const JoinType& join,
   ValueType& initialized_result) {

  // Each vector lane accumulates its share of the range into a local copy
  // seeded with the caller-provided initial value.
  ValueType result = initialized_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }
  initialized_result = result;

  // Reduce within this member's blockDim.x-lane group, then broadcast from
  // lane 0 of the group so all of its lanes agree.
  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
|
||||
|
||||
// All-reduce across the vector lanes of a single team member within a warp;
// no join() provided, so summation is used.
// Assumed mapping of the task team onto the warp:
//   vec_length*team_size == warp_size
//   blockDim.x == vec_length == stride
//   blockDim.y == team_size
//   threadIdx.x == position in vec
//   threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   ValueType& initialized_result) {

  // Each vector lane accumulates its share of the range into a local copy
  // seeded with the caller-provided initial value.
  ValueType result = initialized_result;

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    lambda(i,result);
  }

  initialized_result = result;

  // Sum within this member's blockDim.x-lane group, then broadcast from
  // lane 0 of the group so all of its lanes agree.
  multi_shfl_warp_reduction(
                          [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
                          initialized_result,
                          blockDim.x);
  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
|
||||
|
||||
// Exclusive scan across corresponding vector lanes between team members
// within a warp (the functor is invoked twice per index: once to collect
// the contribution, once to hand back the exclusive prefix).
// Assumed mapping of the task team onto the warp:
//   vec_length*team_size == warp_size
//   blockDim.x == vec_length == stride
//   blockDim.y == team_size
//   threadIdx.x == position in vec
//   threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda) {

  // 'accum' carries the running total across strided loop iterations.
  ValueType accum = 0 ;
  ValueType val, y, local_total;

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    val = 0;
    // First invocation (final == false): collect this index's contribution.
    lambda(i,val,false);

    // intra-blockDim.y exclusive scan on 'val'
    // accum = accumulated, sum in total for this iteration

    // INCLUSIVE scan across team members: shuffle distances step in
    // multiples of the vector length so only corresponding lanes combine.
    for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
      y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
      if(threadIdx.y*blockDim.x >= offset) { val += y; }
    }

    // Grand total of this iteration lives in the last team member's lane;
    // broadcast it to all threads before shifting.
    local_total = shfl_warp_broadcast<ValueType>(val,
                                                 threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
                                                 Impl::CudaTraits::WarpSize);

    // make EXCLUSIVE scan by shifting values over one team member
    val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
    if ( threadIdx.y == 0 ) { val = 0 ; }

    // Second invocation (final == true): hand back the exclusive prefix.
    val += accum;
    lambda(i,val,true);
    accum += local_total;
  }
}
|
||||
|
||||
// Exclusive scan within a single team member's vector lanes within a warp
// (the functor is invoked twice per index: once to collect the contribution,
// once to hand back the exclusive prefix).
// Assumed mapping of the task team onto the warp:
//   vec_length*team_size == warp_size
//   blockDim.x == vec_length == stride
//   blockDim.y == team_size
//   threadIdx.x == position in vec
//   threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda)
{
  // 'accum' carries the running total across strided loop iterations.
  ValueType accum = 0 ;
  ValueType val, y, local_total;

  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    val = 0;
    // First invocation (final == false): collect this index's contribution.
    lambda(i,val,false);

    // intra-blockDim.x exclusive scan on 'val'
    // accum = accumulated, sum in total for this iteration

    // INCLUSIVE scan restricted to this member's blockDim.x-lane group.
    for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
      y = Kokkos::shfl_up(val, offset, blockDim.x);
      if(threadIdx.x >= offset) { val += y; }
    }

    // Grand total of this iteration lives in the group's last lane;
    // broadcast it to the whole group before shifting.
    local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);

    // make EXCLUSIVE scan by shifting values over one lane
    val = Kokkos::shfl_up(val, 1, blockDim.x);
    if ( threadIdx.x == 0 ) { val = 0 ; }

    // Second invocation (final == true): hand back the exclusive prefix.
    val += accum;
    lambda(i,val,true);
    accum += local_total;
  }
}
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
|
||||
|
|
@ -46,9 +46,10 @@
|
|||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
// #define DETAILED_PRINT
|
||||
|
||||
|
@ -93,9 +94,8 @@ CudaTaskPolicyQueue
|
|||
, const unsigned arg_team_size
|
||||
)
|
||||
: m_space( Kokkos::CudaUVMSpace()
|
||||
, arg_task_max_size
|
||||
, arg_task_max_size * arg_task_max_count
|
||||
, 1 /* only one level of memory pool */
|
||||
, arg_task_max_size * arg_task_max_count * 1.2
|
||||
, 16 /* log2(superblock size) */
|
||||
)
|
||||
, m_team { 0 , 0 , 0 }
|
||||
, m_serial { 0 , 0 , 0 }
|
||||
|
@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) {
|
|||
member( kokkos_impl_cuda_shared_memory<void>()
|
||||
, 16 /* shared_begin */
|
||||
, team_task->m_shmem_size /* shared size */
|
||||
, 0 /* scratch level 1 pointer */
|
||||
, 0 /* scratch level 1 size */
|
||||
, 0 /* league rank */
|
||||
, 1 /* league size */
|
||||
);
|
||||
|
@ -926,5 +928,5 @@ void Task::clear_dependence()
|
|||
} /* namespace Kokkos */
|
||||
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
|
|
@ -47,19 +47,11 @@
|
|||
#define KOKKOS_CUDA_TASKPOLICY_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && \
|
||||
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
|
||||
|
||||
#define KOKKOS_ENABLE_CUDA_TASK_POLICY
|
||||
|
||||
/* The TaskPolicy< Cuda > capability requires nvcc using the option:
|
||||
* --relocatable-device-code=true
|
||||
*/
|
||||
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -81,8 +73,6 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
friend struct CudaTaskPolicyQueue ;
|
||||
|
||||
CudaTaskPolicyQueue * m_policy ;
|
||||
TaskMember * volatile * m_queue ;
|
||||
function_team_type m_team ; ///< Apply function on CUDA
|
||||
|
@ -819,9 +809,11 @@ public:
|
|||
static member_type member_single()
|
||||
{
|
||||
return
|
||||
member_type( 0 /* shared memory */
|
||||
, 0 /* shared memory begin */
|
||||
, 0 /* shared memory size */
|
||||
member_type( 0 /* shared memory pointer */
|
||||
, 0 /* shared memory begin offset */
|
||||
, 0 /* shared memory end offset */
|
||||
, 0 /* scratch level_1 pointer */
|
||||
, 0 /* scratch level_1 size */
|
||||
, 0 /* league rank */
|
||||
, 1 /* league size */ );
|
||||
}
|
||||
|
@ -832,10 +824,10 @@ public:
|
|||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */
|
||||
|
||||
|
||||
|
|
|
@ -56,8 +56,6 @@
|
|||
#include <impl/Kokkos_Shape.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace >
|
|||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
|
||||
// Via reinterpret_case this can be used to support all scalar types of those sizes.
|
||||
// Any other scalar type falls back to either normal reads out of global memory,
|
||||
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
|
||||
|
||||
template< typename ValueType
|
||||
, class MemorySpace
|
||||
, class AliasType =
|
||||
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
|
||||
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
|
||||
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 ,
|
||||
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void
|
||||
>::type
|
||||
>::type
|
||||
>::type
|
||||
>::type
|
||||
>
|
||||
class CudaTextureFetch {
|
||||
private:
|
||||
|
||||
cuda_texture_object_type m_obj ;
|
||||
const ValueType * m_alloc_ptr ;
|
||||
int m_offset ;
|
||||
|
||||
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
|
||||
{
|
||||
typedef char const * const byte;
|
||||
|
||||
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
|
||||
|
||||
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
|
||||
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
|
||||
|
||||
const size_t count = tracker.alloc_size() / sizeof(ValueType);
|
||||
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
|
||||
|
||||
if (ok_aligned && ok_contains) {
|
||||
if (tracker.attribute() == NULL ) {
|
||||
MemorySpace::texture_object_attach(
|
||||
tracker
|
||||
, sizeof(ValueType)
|
||||
, cudaCreateChannelDesc< AliasType >()
|
||||
);
|
||||
}
|
||||
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
|
||||
m_offset = arg_ptr - m_alloc_ptr;
|
||||
}
|
||||
else if( !ok_contains ) {
|
||||
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
|
||||
}
|
||||
else {
|
||||
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaTextureFetch() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const CudaTextureFetch & rhs )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_alloc_ptr( rhs.m_alloc_ptr )
|
||||
, m_offset( rhs.m_offset )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
|
||||
{
|
||||
m_obj = rhs.m_obj ;
|
||||
m_alloc_ptr = rhs.m_alloc_ptr ;
|
||||
m_offset = rhs.m_offset ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
|
||||
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
|
||||
{
|
||||
#if defined( KOKKOS_USE_LDG_INTRINSIC )
|
||||
m_alloc_ptr(arg_ptr);
|
||||
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
|
||||
if ( arg_ptr != NULL ) {
|
||||
if ( tracker.is_valid() ) {
|
||||
attach( arg_ptr, tracker );
|
||||
}
|
||||
else {
|
||||
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
|
||||
if ( found_tracker.is_valid() ) {
|
||||
attach( arg_ptr, found_tracker );
|
||||
} else {
|
||||
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
|
||||
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
|
||||
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
|
||||
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
#else
|
||||
return m_alloc_ptr[ i + m_offset ];
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template< typename ValueType, class MemorySpace >
|
||||
class CudaTextureFetch< const ValueType, MemorySpace, float4 > {
|
||||
private:
|
||||
typedef float4 AliasType;
|
||||
cuda_texture_object_type m_obj ;
|
||||
const ValueType * m_alloc_ptr ;
|
||||
int m_offset ;
|
||||
|
||||
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
|
||||
{
|
||||
typedef char const * const byte;
|
||||
|
||||
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
|
||||
|
||||
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
|
||||
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
|
||||
|
||||
const size_t count = tracker.alloc_size() / sizeof(ValueType);
|
||||
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
|
||||
|
||||
if (ok_aligned && ok_contains) {
|
||||
if (tracker.attribute() == NULL ) {
|
||||
MemorySpace::texture_object_attach(
|
||||
tracker
|
||||
, sizeof(ValueType)
|
||||
, cudaCreateChannelDesc< AliasType >()
|
||||
);
|
||||
}
|
||||
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
|
||||
m_offset = arg_ptr - m_alloc_ptr;
|
||||
}
|
||||
else if( !ok_contains ) {
|
||||
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
|
||||
}
|
||||
else {
|
||||
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaTextureFetch() {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const CudaTextureFetch & rhs )
|
||||
: m_obj( rhs.m_obj )
|
||||
, m_alloc_ptr( rhs.m_alloc_ptr )
|
||||
, m_offset( rhs.m_offset )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
|
||||
{
|
||||
m_obj = rhs.m_obj ;
|
||||
m_alloc_ptr = rhs.m_alloc_ptr ;
|
||||
m_offset = rhs.m_offset ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
|
||||
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
|
||||
{
|
||||
#if defined( KOKKOS_USE_LDG_INTRINSIC )
|
||||
m_alloc_ptr(arg_ptr);
|
||||
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
|
||||
if ( arg_ptr != NULL ) {
|
||||
if ( tracker.is_valid() ) {
|
||||
attach( arg_ptr, tracker );
|
||||
}
|
||||
else {
|
||||
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
|
||||
if ( found_tracker.is_valid() ) {
|
||||
attach( arg_ptr, found_tracker );
|
||||
} else {
|
||||
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
|
||||
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
|
||||
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
|
||||
return *(reinterpret_cast<ValueType*> (&v));
|
||||
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
|
||||
union Float4ValueType {
|
||||
float4 f4[2];
|
||||
ValueType val;
|
||||
};
|
||||
Float4ValueType convert;
|
||||
convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) );
|
||||
convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 );
|
||||
return convert.val;
|
||||
#else
|
||||
return m_alloc_ptr[ i + m_offset ];
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template< typename ValueType, class MemorySpace >
|
||||
class CudaTextureFetch< const ValueType, MemorySpace, void >
|
||||
{
|
||||
private:
|
||||
const ValueType * m_ptr ;
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch() : m_ptr(0) {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~CudaTextureFetch() {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
|
||||
m_ptr = rhs.m_ptr;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
explicit KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
|
||||
m_ptr = base_view_ptr;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
|
||||
m_ptr = base_view_ptr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
operator const ValueType * () const { return m_ptr ; }
|
||||
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType operator[]( const iType & i ) const
|
||||
{
|
||||
return m_ptr[ i ];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief  Replace default ViewDataHandle with Cuda texture fetch
 *          specialization if 'const' value type, CudaSpace or CudaUVMSpace
 *          memory space, and RandomAccess memory traits.
 */
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
  typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
                        is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
                      &&
                      is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
                      &&
                      ViewTraits::memory_traits::RandomAccess
                    >::type >
{
public:
  // Texture fetches return by value, never by reference.
  enum { ReturnTypeIsReference = false };

  typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
                                , typename ViewTraits::memory_space> handle_type;

  // Wrap the raw view pointer (and its allocation tracker) in a
  // texture-fetch handle.
  KOKKOS_INLINE_FUNCTION
  static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
  {
    return handle_type(arg_data_ptr, arg_tracker);
  }

  typedef typename ViewTraits::value_type return_type;
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif // KOKKOS_HAVE_CUDA
|
||||
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
|
||||
|
||||
|
|
|
@ -0,0 +1,611 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
|
||||
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
|
||||
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <initializer_list>
|
||||
|
||||
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
|
||||
#define KOKKOS_MDRANGE_IVDEP
|
||||
#endif
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
// Traversal-order options for the dimensions of a multi-dimensional range.
enum class Iterate
{
  Default, // Default for the device
  Left,    // Left indices stride fastest
  Right,   // Right indices stride fastest
  Flat,    // Do not tile, only valid for inner direction
};
|
||||
|
||||
// Default traversal direction for the outer (tile) loops of an MDRange.
// NOTE(review): currently Iterate::Right for every execution space —
// presumably intended to be specialized per device; confirm.
template <typename ExecSpace>
struct default_outer_direction
{
  using type = Iterate;
  static constexpr Iterate value = Iterate::Right;
};
|
||||
|
||||
// Default traversal direction for the inner (within-tile) loops of an
// MDRange.  NOTE(review): currently Iterate::Right for every execution
// space — presumably intended to be specialized per device; confirm.
template <typename ExecSpace>
struct default_inner_direction
{
  using type = Iterate;
  static constexpr Iterate value = Iterate::Right;
};
|
||||
|
||||
|
||||
// Iteration Pattern: compile-time description of an N-dimensional iteration
// space with a chosen outer (tile) and inner (within-tile) traversal
// direction.  Only ranks 2 and 3 are accepted by the static_asserts below.
template < unsigned N
         , Iterate OuterDir = Iterate::Default
         , Iterate InnerDir = Iterate::Default
         >
struct Rank
{
  static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
  static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
  static_assert( N < 4u, "Kokkos Error: Unsupported rank...");

  using iteration_pattern = Rank<N, OuterDir, InnerDir>;

  static constexpr int rank = N;
  static constexpr Iterate outer_direction = OuterDir;
  static constexpr Iterate inner_direction = InnerDir;
};
|
||||
|
||||
|
||||
|
||||
// multi-dimensional iteration pattern
|
||||
template <typename... Properties>
|
||||
struct MDRangePolicy
|
||||
{
|
||||
using range_policy = RangePolicy<Properties...>;
|
||||
|
||||
static_assert( !std::is_same<range_policy,void>::value
|
||||
, "Kokkos Error: MD iteration pattern not defined" );
|
||||
|
||||
using iteration_pattern = typename range_policy::iteration_pattern;
|
||||
using work_tag = typename range_policy::work_tag;
|
||||
|
||||
static constexpr int rank = iteration_pattern::rank;
|
||||
|
||||
static constexpr int outer_direction = static_cast<int> (
|
||||
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
|
||||
? iteration_pattern::outer_direction
|
||||
: default_outer_direction< typename range_policy::execution_space>::value );
|
||||
|
||||
static constexpr int inner_direction = static_cast<int> (
|
||||
iteration_pattern::inner_direction != Iterate::Default
|
||||
? iteration_pattern::inner_direction
|
||||
: default_inner_direction< typename range_policy::execution_space>::value ) ;
|
||||
|
||||
|
||||
// Ugly ugly workaround intel 14 not handling scoped enum correctly
|
||||
static constexpr int Flat = static_cast<int>( Iterate::Flat );
|
||||
static constexpr int Right = static_cast<int>( Iterate::Right );
|
||||
|
||||
|
||||
using size_type = typename range_policy::index_type;
|
||||
using index_type = typename std::make_signed<size_type>::type;
|
||||
|
||||
|
||||
template <typename I>
|
||||
MDRangePolicy( std::initializer_list<I> upper_corner )
|
||||
{
|
||||
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
|
||||
|
||||
// TODO check size of lists equal to rank
|
||||
// static_asserts on initializer_list.size() require c++14
|
||||
|
||||
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
|
||||
|
||||
const auto u = upper_corner.begin();
|
||||
|
||||
m_num_tiles = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
m_offset[i] = static_cast<index_type>(0);
|
||||
m_dim[i] = static_cast<index_type>(u[i]);
|
||||
if (inner_direction != Flat) {
|
||||
// default tile size to 4
|
||||
m_tile[i] = 4;
|
||||
} else {
|
||||
m_tile[i] = 1;
|
||||
}
|
||||
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
|
||||
m_num_tiles *= m_tile_dim[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename IA, typename IB>
|
||||
MDRangePolicy( std::initializer_list<IA> corner_a
|
||||
, std::initializer_list<IB> corner_b
|
||||
)
|
||||
{
|
||||
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
|
||||
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
|
||||
|
||||
// TODO check size of lists equal to rank
|
||||
// static_asserts on initializer_list.size() require c++14
|
||||
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
|
||||
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
|
||||
|
||||
|
||||
using A = typename std::make_signed<IA>::type;
|
||||
using B = typename std::make_signed<IB>::type;
|
||||
|
||||
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
|
||||
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
|
||||
|
||||
m_num_tiles = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
|
||||
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
|
||||
if (inner_direction != Flat) {
|
||||
// default tile size to 4
|
||||
m_tile[i] = 4;
|
||||
} else {
|
||||
m_tile[i] = 1;
|
||||
}
|
||||
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
|
||||
m_num_tiles *= m_tile_dim[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename IA, typename IB, typename T>
|
||||
MDRangePolicy( std::initializer_list<IA> corner_a
|
||||
, std::initializer_list<IB> corner_b
|
||||
, std::initializer_list<T> tile
|
||||
)
|
||||
{
|
||||
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
|
||||
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
|
||||
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
|
||||
static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
|
||||
|
||||
// TODO check size of lists equal to rank
|
||||
// static_asserts on initializer_list.size() require c++14
|
||||
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
|
||||
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
|
||||
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
|
||||
|
||||
using A = typename std::make_signed<IA>::type;
|
||||
using B = typename std::make_signed<IB>::type;
|
||||
|
||||
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
|
||||
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
|
||||
const auto t = tile.begin();
|
||||
|
||||
m_num_tiles = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
|
||||
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
|
||||
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
|
||||
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
|
||||
m_num_tiles *= m_tile_dim[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Inclusive lower bound of each dimension.
index_type m_offset[rank];
// Extent (iteration count) of each dimension.
index_type m_dim[rank];
// Tile extent per dimension (set to 1 when iteration is Flat).
int m_tile[rank];
// Tiles per dimension: ceil(m_dim[i] / m_tile[i]).
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
};
|
||||
|
||||
namespace Impl {
|
||||
|
||||
// Serial, Threads, OpenMP
|
||||
// use enable_if to overload for Cuda
|
||||
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
//
// Adapter that maps a single 1-D work index onto the multidimensional
// indices of an MDRange policy and invokes the user functor.  Exactly one
// operator() overload is enabled per (rank, Flat-vs-tiled, tagged-vs-untagged)
// combination, selected via enable_if.
// - Flat iteration: the 1-D index covers the whole range; it is decomposed
//   directly into per-dimension indices (div/mod by the dimension extents).
// - Tiled (non-Flat) iteration: the 1-D index selects a tile; the tile's
//   begin/end bounds are computed (clamped at the range boundary) and the
//   tile interior is traversed with nested loops whose nesting order follows
//   inner_direction.
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
  using work_tag = typename MDRange::work_tag;
  using index_type = typename MDRange::index_type;
  using size_type = typename MDRange::size_type;

  MDRange m_range;   // iteration bounds, tile sizes, and tile counts
  Functor m_func;    // user functor invoked once per multidimensional index

  KOKKOS_INLINE_FUNCTION
  MDForFunctor( MDRange const& range, Functor const& f )
    : m_range(range)
    , m_func( f )
  {}

  KOKKOS_INLINE_FUNCTION
  MDForFunctor( MDRange const& range, Functor && f )
    : m_range(range)
    , m_func( std::forward<Functor>(f) )
  {}

  KOKKOS_INLINE_FUNCTION
  MDForFunctor( MDRange && range, Functor const& f )
    : m_range( std::forward<MDRange>(range) )
    , m_func( f )
  {}

  KOKKOS_INLINE_FUNCTION
  MDForFunctor( MDRange && range, Functor && f )
    : m_range( std::forward<MDRange>(range) )
    , m_func( std::forward<Functor>(f) )
  {}

  KOKKOS_INLINE_FUNCTION
  MDForFunctor( MDForFunctor const& ) = default;

  KOKKOS_INLINE_FUNCTION
  MDForFunctor& operator=( MDForFunctor const& ) = default;

  KOKKOS_INLINE_FUNCTION
  MDForFunctor( MDForFunctor && ) = default;

  KOKKOS_INLINE_FUNCTION
  MDForFunctor& operator=( MDForFunctor && ) = default;

  // Rank-2, Flat, No Tag
  // Decompose t: Right => dimension 1 varies fastest; otherwise dimension 0.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && std::is_same<void, work_tag>::value
                          && MDRange::rank == 2
                          && MDRange::inner_direction == MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    if ( MDRange::outer_direction == MDRange::Right ) {
      m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
    } else {
      m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
    }
  }

  // Rank-2, Flat, Tag
  // Same decomposition as above, but the functor receives a work_tag first.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && !std::is_same<void, work_tag>::value
                          && MDRange::rank == 2
                          && MDRange::inner_direction == MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    if ( MDRange::outer_direction == MDRange::Right ) {
      m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
    } else {
      m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
    }
  }

  // Rank-2, Not Flat, No Tag
  // t selects a tile; iterate the tile interior, clamping at range end.
  // NOTE(review): loop indices are declared int while the bounds are
  // index_type — potential narrowing for very large extents; confirm.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && std::is_same<void, work_tag>::value
                          && MDRange::rank == 2
                          && MDRange::inner_direction != MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    // Tile coordinates (t0, t1) recovered from the flat tile index.
    index_type t0, t1;
    if ( MDRange::outer_direction == MDRange::Right ) {
      t0 = t / m_range.m_tile_dim[1];
      t1 = t % m_range.m_tile_dim[1];
    } else {
      t0 = t % m_range.m_tile_dim[0];
      t1 = t / m_range.m_tile_dim[0];
    }

    // Tile begin bounds in index space.
    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];

    // Tile end bounds, clamped so a partial last tile stays in range.
    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );

    if ( MDRange::inner_direction == MDRange::Right ) {
      for (int i0=b0; i0<e0; ++i0) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i1=b1; i1<e1; ++i1) {
        m_func( i0, i1 );
      }}
    } else {
      for (int i1=b1; i1<e1; ++i1) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i0=b0; i0<e0; ++i0) {
        m_func( i0, i1 );
      }}
    }
  }

  // Rank-2, Not Flat, Tag
  // Tiled traversal as above, passing a work_tag to the functor.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && !std::is_same<void, work_tag>::value
                          && MDRange::rank == 2
                          && MDRange::inner_direction != MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    work_tag tag;

    index_type t0, t1;
    if ( MDRange::outer_direction == MDRange::Right ) {
      t0 = t / m_range.m_tile_dim[1];
      t1 = t % m_range.m_tile_dim[1];
    } else {
      t0 = t % m_range.m_tile_dim[0];
      t1 = t / m_range.m_tile_dim[0];
    }

    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];

    // Tile end bounds, clamped so a partial last tile stays in range.
    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );

    if ( MDRange::inner_direction == MDRange::Right ) {
      for (int i0=b0; i0<e0; ++i0) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i1=b1; i1<e1; ++i1) {
        m_func( tag, i0, i1 );
      }}
    } else {
      for (int i1=b1; i1<e1; ++i1) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i0=b0; i0<e0; ++i0) {
        m_func( tag, i0, i1 );
      }}
    }
  }

  //---------------------------------------------------------------------------

  // Rank-3, Flat, No Tag
  // Decompose t into three indices via the product of the two
  // fastest-varying dimension extents.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && std::is_same<void, work_tag>::value
                          && MDRange::rank == 3
                          && MDRange::inner_direction == MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    if ( MDRange::outer_direction == MDRange::Right ) {
      const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
      m_func( m_range.m_offset[0] + ( t / tmp_prod )
            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
            , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
            );
    } else {
      const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
      m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
            , m_range.m_offset[2] + ( t / tmp_prod )
            );
    }
  }

  // Rank-3, Flat, Tag
  // Same decomposition, with a work_tag passed first.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && !std::is_same<void, work_tag>::value
                          && MDRange::rank == 3
                          && MDRange::inner_direction == MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    if ( MDRange::outer_direction == MDRange::Right ) {
      const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
      m_func( work_tag{}
            , m_range.m_offset[0] + ( t / tmp_prod )
            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
            , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
            );
    } else {
      const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
      m_func( work_tag{}
            , m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
            , m_range.m_offset[2] + ( t / tmp_prod )
            );
    }
  }

  // Rank-3, Not Flat, No Tag
  // t selects a tile in the 3-D tile grid; iterate its interior,
  // clamping each dimension at the range boundary.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && std::is_same<void, work_tag>::value
                          && MDRange::rank == 3
                          && MDRange::inner_direction != MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    // Tile coordinates (t0, t1, t2) recovered from the flat tile index.
    index_type t0, t1, t2;
    if ( MDRange::outer_direction == MDRange::Right ) {
      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
      t0 = t / tmp_prod;
      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
    } else {
      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
      t2 = t / tmp_prod;
    }

    // Tile begin bounds in index space.
    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];

    // Tile end bounds, clamped so a partial last tile stays in range.
    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );

    if ( MDRange::inner_direction == MDRange::Right ) {
      for (int i0=b0; i0<e0; ++i0) {
      for (int i1=b1; i1<e1; ++i1) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i2=b2; i2<e2; ++i2) {
        m_func( i0, i1, i2 );
      }}}
    } else {
      for (int i2=b2; i2<e2; ++i2) {
      for (int i1=b1; i1<e1; ++i1) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i0=b0; i0<e0; ++i0) {
        m_func( i0, i1, i2 );
      }}}
    }
  }

  // Rank-3, Not Flat, Tag
  // Tiled traversal as above, passing a work_tag to the functor.
  template <typename Idx>
  KOKKOS_FORCEINLINE_FUNCTION
  typename std::enable_if<( std::is_integral<Idx>::value
                          && !std::is_same<void, work_tag>::value
                          && MDRange::rank == 3
                          && MDRange::inner_direction != MDRange::Flat
                          )>::type
  operator()(Idx t) const
  {
    work_tag tag;

    index_type t0, t1, t2;
    if ( MDRange::outer_direction == MDRange::Right ) {
      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
      t0 = t / tmp_prod;
      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
    } else {
      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
      t2 = t / tmp_prod;
    }

    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];

    // Tile end bounds, clamped so a partial last tile stays in range.
    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );

    if ( MDRange::inner_direction == MDRange::Right ) {
      for (int i0=b0; i0<e0; ++i0) {
      for (int i1=b1; i1<e1; ++i1) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i2=b2; i2<e2; ++i2) {
        m_func( tag, i0, i1, i2 );
      }}}
    } else {
      for (int i2=b2; i2<e2; ++i2) {
      for (int i1=b1; i1<e1; ++i1) {
      #if defined(KOKKOS_MDRANGE_IVDEP)
      #pragma ivdep
      #endif
      for (int i0=b0; i0<e0; ++i0) {
        m_func( tag, i0, i1, i2 );
      }}}
    }
  }
};
|
||||
|
||||
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
|
||||
template <typename MDRange, typename Functor>
|
||||
void md_parallel_for( MDRange const& range
|
||||
, Functor const& f
|
||||
, const std::string& str = ""
|
||||
)
|
||||
{
|
||||
Impl::MDForFunctor<MDRange, Functor> g(range, f);
|
||||
|
||||
using range_policy = typename MDRange::range_policy;
|
||||
|
||||
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
|
||||
}
|
||||
|
||||
template <typename MDRange, typename Functor>
|
||||
void md_parallel_for( const std::string& str
|
||||
, MDRange const& range
|
||||
, Functor const& f
|
||||
)
|
||||
{
|
||||
Impl::MDForFunctor<MDRange, Functor> g(range, f);
|
||||
|
||||
using range_policy = typename MDRange::range_policy;
|
||||
|
||||
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
|
||||
}
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -121,13 +121,22 @@ public:
|
|||
return *this;
|
||||
}
|
||||
|
||||
//! Assignment operator.
|
||||
/// \brief Assignment operator, for volatile <tt>*this</tt> and
|
||||
/// nonvolatile input.
|
||||
///
|
||||
/// \param src [in] Input; right-hand side of the assignment.
|
||||
///
|
||||
/// This operator returns \c void instead of <tt>volatile
|
||||
/// complex<RealType>& </tt>. See Kokkos Issue #177 for the
|
||||
/// explanation. In practice, this means that you should not chain
|
||||
/// assignments with volatile lvalues.
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
|
||||
void operator= (const complex<InputRealType>& src) volatile {
|
||||
re_ = src.re_;
|
||||
im_ = src.im_;
|
||||
return *this;
|
||||
// We deliberately do not return anything here. See explanation
|
||||
// in public documentation above.
|
||||
}
|
||||
|
||||
//! Assignment operator.
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,86 +36,43 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
|
||||
#define KOKKOS_BASIC_ALLOCATORS_HPP
|
||||
#ifndef KOKKOS_CORE_CONCEPTS_HPP
|
||||
#define KOKKOS_CORE_CONCEPTS_HPP
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
#include <type_traits>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
namespace Kokkos {
|
||||
//Schedules for Execution Policies
|
||||
struct Static {};
|
||||
struct Dynamic {};
|
||||
|
||||
/// class UnmanagedAllocator
|
||||
/// does nothing when deallocate(ptr,size) is called
|
||||
class UnmanagedAllocator
|
||||
//Schedule Wrapper Type
|
||||
template<class T>
|
||||
struct Schedule
|
||||
{
|
||||
public:
|
||||
static const char * name() { return "Unmanaged Allocator"; }
|
||||
|
||||
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
|
||||
static_assert( std::is_same<T,Static>::value
|
||||
|| std::is_same<T,Dynamic>::value
|
||||
, "Kokkos: Invalid Schedule<> type."
|
||||
);
|
||||
using schedule_type = Schedule<T>;
|
||||
using type = T;
|
||||
};
|
||||
|
||||
|
||||
/// class MallocAllocator
|
||||
class MallocAllocator
|
||||
//Specify Iteration Index Type
|
||||
template<typename T>
|
||||
struct IndexType
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Malloc Allocator";
|
||||
}
|
||||
|
||||
static void* allocate(size_t size);
|
||||
|
||||
static void deallocate(void * ptr, size_t size);
|
||||
|
||||
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
|
||||
static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
|
||||
using index_type = IndexType<T>;
|
||||
using type = T;
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/// class AlignedAllocator
|
||||
/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
|
||||
class AlignedAllocator
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Aligned Allocator";
|
||||
}
|
||||
|
||||
static void* allocate(size_t size);
|
||||
|
||||
static void deallocate(void * ptr, size_t size);
|
||||
|
||||
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
|
||||
};
|
||||
|
||||
|
||||
/// class PageAlignedAllocator
|
||||
/// memory aligned to PAGE_SIZE
|
||||
class PageAlignedAllocator
|
||||
{
|
||||
public:
|
||||
static const char * name()
|
||||
{
|
||||
return "Page Aligned Allocator";
|
||||
}
|
||||
|
||||
static void* allocate(size_t size);
|
||||
|
||||
static void deallocate(void * ptr, size_t size);
|
||||
|
||||
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
|
||||
};
|
||||
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
#endif //KOKKOS_BASIC_ALLOCATORS_HPP
|
||||
|
||||
#endif // KOKKOS_CORE_CONCEPTS_HPP
|
||||
|
|
@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
|||
} // namespace Kokkos
|
||||
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
using Kokkos::Experimental::kokkos_malloc ;
|
||||
|
@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ;
|
|||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
// should only by used by kokkos_malloc and kokkos_free
|
||||
struct MallocHelper
|
||||
{
|
||||
static void increment_ref_count( AllocationTracker const & tracker )
|
||||
{
|
||||
tracker.increment_ref_count();
|
||||
}
|
||||
|
||||
static void decrement_ref_count( AllocationTracker const & tracker )
|
||||
{
|
||||
tracker.decrement_ref_count();
|
||||
}
|
||||
};
|
||||
} // namespace Impl
|
||||
|
||||
/* Allocate memory from a memory space.
|
||||
* The allocation is tracked in Kokkos memory tracking system, so
|
||||
* leaked memory can be identified.
|
||||
*/
|
||||
template< class Arg = DefaultExecutionSpace>
|
||||
void* kokkos_malloc(const std::string label, size_t count) {
|
||||
if(count == 0) return NULL;
|
||||
typedef typename Arg::memory_space MemorySpace;
|
||||
Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);;
|
||||
Impl::MallocHelper::increment_ref_count( tracker );
|
||||
return tracker.alloc_ptr();
|
||||
}
|
||||
|
||||
template< class Arg = DefaultExecutionSpace>
|
||||
void* kokkos_malloc(const size_t& count) {
|
||||
return kokkos_malloc<Arg>("DefaultLabel",count);
|
||||
}
|
||||
|
||||
|
||||
/* Free memory from a memory space.
|
||||
*/
|
||||
template< class Arg = DefaultExecutionSpace>
|
||||
void kokkos_free(const void* ptr) {
|
||||
typedef typename Arg::memory_space MemorySpace;
|
||||
typedef typename MemorySpace::allocator allocator;
|
||||
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
|
||||
if (tracker.is_valid()) {
|
||||
Impl::MallocHelper::decrement_ref_count( tracker );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template< class Arg = DefaultExecutionSpace>
|
||||
void* kokkos_realloc(const void* old_ptr, size_t size) {
|
||||
if(old_ptr == NULL)
|
||||
return kokkos_malloc<Arg>(size);
|
||||
|
||||
typedef typename Arg::memory_space MemorySpace;
|
||||
typedef typename MemorySpace::allocator allocator;
|
||||
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
|
||||
|
||||
tracker.reallocate(size);
|
||||
|
||||
return tracker.alloc_ptr();
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -69,6 +69,9 @@ namespace {
|
|||
/**\brief Token to indicate that a parameter's value is to be automatically selected */
|
||||
constexpr AUTO_t AUTO = Kokkos::AUTO_t();
|
||||
}
|
||||
|
||||
struct InvalidType {};
|
||||
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -205,7 +208,7 @@ namespace Impl {
|
|||
template< class Functor
|
||||
, class Policy
|
||||
, class EnableFunctor = void
|
||||
, class EnablePolicy = void
|
||||
, class EnablePolicy = void
|
||||
>
|
||||
struct FunctorPolicyExecutionSpace;
|
||||
|
||||
|
@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
|
|||
///
|
||||
/// This is an implementation detail of parallel_reduce. Users should
|
||||
/// skip this and go directly to the nonmember function parallel_reduce.
|
||||
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
|
||||
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelReduce ;
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -56,11 +56,14 @@
|
|||
#include <Kokkos_CudaSpace.hpp>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -108,7 +111,7 @@ public:
|
|||
//! This execution space's preferred array layout.
|
||||
typedef LayoutLeft array_layout ;
|
||||
|
||||
//!
|
||||
//!
|
||||
typedef ScratchMemorySpace< Cuda > scratch_memory_space ;
|
||||
|
||||
//@}
|
||||
|
@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace
|
|||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_View.hpp>
|
||||
|
||||
#include <KokkosExp_View.hpp>
|
||||
#include <Cuda/KokkosExp_Cuda_View.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Task.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -54,10 +54,7 @@
|
|||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_abort.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
|
@ -77,33 +74,6 @@ public:
|
|||
|
||||
/*--------------------------------*/
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Impl::CudaMallocAllocator allocator;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
*
|
||||
* The input label is associated with the block of memory.
|
||||
* The block of memory is tracked via reference counting where
|
||||
* allocation gives it a reference count of one.
|
||||
*/
|
||||
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
/*--------------------------------*/
|
||||
/** \brief Cuda specific function to attached texture object to an allocation.
|
||||
* Output the texture object, base pointer, and offset from the input pointer.
|
||||
*/
|
||||
#if defined( __CUDACC__ )
|
||||
static void texture_object_attach( Impl::AllocationTracker const & tracker
|
||||
, unsigned type_size
|
||||
, ::cudaChannelFormatDesc const & desc
|
||||
);
|
||||
#endif
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaSpace();
|
||||
CudaSpace( CudaSpace && rhs ) = default ;
|
||||
CudaSpace( const CudaSpace & rhs ) = default ;
|
||||
|
@ -137,7 +107,7 @@ namespace Impl {
|
|||
/// where the hash value is derived from the address of the
|
||||
/// object for which an atomic operation is performed.
|
||||
/// This function initializes the locks to zero (unset).
|
||||
void init_lock_array_cuda_space();
|
||||
void init_lock_arrays_cuda_space();
|
||||
|
||||
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
|
||||
///
|
||||
|
@ -146,7 +116,23 @@ void init_lock_array_cuda_space();
|
|||
/// object for which an atomic operation is performed.
|
||||
/// This function retrieves the lock array pointer.
|
||||
/// If the array is not yet allocated it will do so.
|
||||
int* lock_array_cuda_space_ptr(bool deallocate = false);
|
||||
int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
|
||||
|
||||
/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
|
||||
///
|
||||
/// Team and Thread private scratch allocations in
|
||||
/// global memory are aquired via locks.
|
||||
/// This function retrieves the lock array pointer.
|
||||
/// If the array is not yet allocated it will do so.
|
||||
int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
|
||||
|
||||
/// \brief Retrieve the pointer to the scratch array for unique identifiers.
|
||||
///
|
||||
/// Unique identifiers in the range 0-Cuda::concurrency
|
||||
/// are provided via locks.
|
||||
/// This function retrieves the lock array pointer.
|
||||
/// If the array is not yet allocated it will do so.
|
||||
int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
||||
|
@ -172,33 +158,6 @@ public:
|
|||
|
||||
/*--------------------------------*/
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Impl::CudaUVMAllocator allocator;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
*
|
||||
* The input label is associated with the block of memory.
|
||||
* The block of memory is tracked via reference counting where
|
||||
* allocation gives it a reference count of one.
|
||||
*/
|
||||
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
|
||||
/** \brief Cuda specific function to attached texture object to an allocation.
|
||||
* Output the texture object, base pointer, and offset from the input pointer.
|
||||
*/
|
||||
#if defined( __CUDACC__ )
|
||||
static void texture_object_attach( Impl::AllocationTracker const & tracker
|
||||
, unsigned type_size
|
||||
, ::cudaChannelFormatDesc const & desc
|
||||
);
|
||||
#endif
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaUVMSpace();
|
||||
CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
|
||||
CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
|
||||
|
@ -242,22 +201,6 @@ public:
|
|||
|
||||
/*--------------------------------*/
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Impl::CudaHostAllocator allocator ;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
*
|
||||
* The input label is associated with the block of memory.
|
||||
* The block of memory is tracked via reference counting where
|
||||
* allocation gives it a reference count of one.
|
||||
*/
|
||||
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaHostPinnedSpace();
|
||||
CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
|
||||
CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -47,167 +47,15 @@
|
|||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_StaticAssert.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_AnalyzePolicy.hpp>
|
||||
#include <Kokkos_Concepts.hpp>
|
||||
#include <iostream>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
//Schedules for Execution Policies
|
||||
struct Static {
|
||||
};
|
||||
|
||||
struct Dynamic {
|
||||
};
|
||||
|
||||
//Schedule Wrapper Type
|
||||
template<class ScheduleType>
|
||||
struct Schedule {
|
||||
static_assert(std::is_same<ScheduleType,Static>::value ||
|
||||
std::is_same<ScheduleType,Dynamic>::value,
|
||||
"Kokkos: Invalid Schedule<> type.");
|
||||
typedef Schedule<ScheduleType> schedule_type;
|
||||
typedef ScheduleType type;
|
||||
};
|
||||
|
||||
//Specif Iteration Index Type
|
||||
template<typename iType>
|
||||
struct IndexType {
|
||||
static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>.");
|
||||
typedef IndexType<iType> index_type;
|
||||
typedef iType type;
|
||||
};
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<class Arg>
|
||||
struct is_schedule_type {
|
||||
enum { value = 0};
|
||||
};
|
||||
|
||||
template<class ScheduleType>
|
||||
struct is_schedule_type<Schedule<ScheduleType> > {
|
||||
enum {value = 1 };
|
||||
};
|
||||
|
||||
template<class Arg>
|
||||
struct is_index_type {
|
||||
enum { value = 0 };
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
struct is_index_type<IndexType<iType> > {
|
||||
enum { value = 1 };
|
||||
};
|
||||
|
||||
template<typename Arg>
|
||||
struct is_tag_type {
|
||||
enum { value = !(is_execution_space<Arg>::value ||
|
||||
is_schedule_type<Arg>::value ||
|
||||
is_index_type<Arg>::value ||
|
||||
std::is_integral<Arg>::value)};
|
||||
};
|
||||
|
||||
//Policy Traits
|
||||
template<class ... Properties>
|
||||
struct PolicyTraits;
|
||||
|
||||
template<>
|
||||
struct PolicyTraits<void> {
|
||||
typedef void execution_space;
|
||||
typedef void schedule_type;
|
||||
typedef void index_type;
|
||||
typedef void tag_type;
|
||||
};
|
||||
|
||||
|
||||
//Strip off ExecutionSpace
|
||||
template<class ExecutionSpace, class ... Props>
|
||||
struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> {
|
||||
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value,
|
||||
"ExecutionPolicy: Only one execution space template argument may be used.");
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
|
||||
};
|
||||
|
||||
//Strip off ScheduleType
|
||||
template<class ScheduleType, class ... Props>
|
||||
struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> {
|
||||
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value,
|
||||
"ExecutionPolicy: Only one Schedule<..> template argument may be used.");
|
||||
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
|
||||
typedef ScheduleType schedule_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
|
||||
};
|
||||
|
||||
//Strip off IndexType
|
||||
template<typename iType, class ... Props>
|
||||
struct PolicyTraits<void, IndexType<iType>,Props ...> {
|
||||
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
|
||||
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
|
||||
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
|
||||
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
|
||||
typedef iType index_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
|
||||
};
|
||||
|
||||
//Strip off raw IndexType
|
||||
template<typename iType, class ... Props>
|
||||
struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> {
|
||||
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
|
||||
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
|
||||
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
|
||||
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
|
||||
typedef iType index_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
|
||||
};
|
||||
|
||||
//Strip off TagType
|
||||
template<class TagType, class ... Props>
|
||||
struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value &&
|
||||
!is_execution_space<TagType>::value &&
|
||||
!is_index_type<TagType>::value &&
|
||||
!std::is_integral<TagType>::value
|
||||
>::type,
|
||||
TagType,Props ...> {
|
||||
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value,
|
||||
"ExecutionPolicy: Only one tag type template argument may be used.");
|
||||
|
||||
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
|
||||
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
|
||||
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
|
||||
typedef TagType tag_type;
|
||||
};
|
||||
|
||||
|
||||
template<class ... Props>
|
||||
struct PolicyTraits {
|
||||
#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION
|
||||
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value,
|
||||
Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space;
|
||||
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value,
|
||||
Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type;
|
||||
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value,
|
||||
typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type;
|
||||
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value,
|
||||
void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag;
|
||||
#else
|
||||
typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space;
|
||||
typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type;
|
||||
typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag;
|
||||
typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type;
|
||||
typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type;
|
||||
#endif
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
/** \brief Execution policy for work over a range of an integral type.
|
||||
*
|
||||
* Valid template argument options:
|
||||
|
@ -230,7 +78,9 @@ namespace Kokkos {
|
|||
* Blocking is the granularity of partitioning the range among threads.
|
||||
*/
|
||||
template<class ... Properties>
|
||||
class RangePolicy: public Impl::PolicyTraits<Properties ... > {
|
||||
class RangePolicy
|
||||
: public Impl::PolicyTraits<Properties ... >
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Impl::PolicyTraits<Properties ... > traits;
|
||||
|
@ -243,6 +93,7 @@ private:
|
|||
public:
|
||||
|
||||
//! Tag this class as an execution policy
|
||||
typedef RangePolicy execution_policy;
|
||||
typedef typename traits::index_type member_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
|
||||
|
@ -348,7 +199,7 @@ public:
|
|||
: m_begin(0), m_end(0)
|
||||
{
|
||||
if ( part_size ) {
|
||||
|
||||
|
||||
// Split evenly among partitions, then round up to the granularity.
|
||||
const member_type work_part =
|
||||
( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
|
||||
|
@ -356,7 +207,7 @@ public:
|
|||
|
||||
m_begin = range.begin() + work_part * part_rank ;
|
||||
m_end = m_begin + work_part ;
|
||||
|
||||
|
||||
if ( range.end() < m_begin ) m_begin = range.end() ;
|
||||
if ( range.end() < m_end ) m_end = range.end() ;
|
||||
}
|
||||
|
@ -366,10 +217,11 @@ public:
|
|||
member_type m_end ;
|
||||
WorkRange();
|
||||
WorkRange & operator = ( const WorkRange & );
|
||||
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -377,38 +229,6 @@ public:
|
|||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/** \brief Scratch memory request accepting per team and per thread value
|
||||
*
|
||||
* An instance of this class can be given as the last argument to a
|
||||
* TeamPolicy constructor. It sets the amount of user requested shared
|
||||
* memory for the team.
|
||||
*/
|
||||
|
||||
template< class MemorySpace >
|
||||
class TeamScratchRequest {
|
||||
size_t m_per_team;
|
||||
size_t m_per_thread;
|
||||
|
||||
public:
|
||||
TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
|
||||
m_per_team(per_team_), m_per_thread(per_thread_) {
|
||||
}
|
||||
|
||||
size_t per_team() const {
|
||||
return m_per_team;
|
||||
}
|
||||
size_t per_thread() const {
|
||||
return m_per_thread;
|
||||
}
|
||||
size_t total(const size_t team_size) const {
|
||||
return m_per_team + m_per_thread * team_size;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
namespace Impl {
|
||||
|
||||
|
||||
|
@ -451,11 +271,9 @@ public:
|
|||
|
||||
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
|
||||
/* TeamPolicyInternal( int league_size_request , int team_size_request );
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
|
||||
TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
|
||||
|
||||
/** \brief The actual league size (number of teams) of the policy.
|
||||
*
|
||||
|
@ -574,12 +392,14 @@ class TeamPolicy: public
|
|||
typedef Impl::TeamPolicyInternal<
|
||||
typename Impl::PolicyTraits<Properties ... >::execution_space,
|
||||
Properties ...> internal_policy;
|
||||
|
||||
typedef Impl::PolicyTraits<Properties ... > traits;
|
||||
|
||||
public:
|
||||
typedef TeamPolicy execution_policy;
|
||||
|
||||
TeamPolicy& operator = (const TeamPolicy&) = default;
|
||||
|
||||
|
||||
/** \brief Construct policy with the given instance of the execution space */
|
||||
TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
|
||||
: internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {}
|
||||
|
@ -594,13 +414,11 @@ public:
|
|||
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
|
||||
: internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
|
||||
: internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {}
|
||||
/* TeamPolicy( int league_size_request , int team_size_request )
|
||||
: internal_policy(league_size_request,team_size_request) {}
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
|
||||
: internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {}
|
||||
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & )
|
||||
: internal_policy(league_size_request,Kokkos::AUTO()) {}*/
|
||||
|
||||
private:
|
||||
TeamPolicy(const internal_policy& p):internal_policy(p) {}
|
||||
|
@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(
|
|||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
#endif /* #define KOKKOS_EXECPOLICY_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -120,21 +120,6 @@ public:
|
|||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Impl::HBWMallocAllocator allocator ;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
*
|
||||
* The input label is associated with the block of memory.
|
||||
* The block of memory is tracked via reference counting where
|
||||
* allocation gives it a reference count of one.
|
||||
*/
|
||||
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HBWSpace */
|
||||
static int in_parallel();
|
||||
|
|
|
@ -55,9 +55,6 @@
|
|||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
#include <impl/Kokkos_BasicAllocators.hpp>
|
||||
|
||||
#include <impl/KokkosExp_SharedAlloc.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
@ -128,25 +125,6 @@ public:
|
|||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
|
||||
typedef Impl::PageAlignedAllocator allocator ;
|
||||
#else
|
||||
typedef Impl::AlignedAllocator allocator ;
|
||||
#endif
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
*
|
||||
* The input label is associated with the block of memory.
|
||||
* The block of memory is tracked via reference counting where
|
||||
* allocation gives it a reference count of one.
|
||||
*/
|
||||
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HostSpace */
|
||||
static int in_parallel();
|
||||
|
|
|
@ -133,11 +133,23 @@
|
|||
// still identifies as 7.0
|
||||
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
|
||||
#endif
|
||||
#if ( CUDA_VERSION < 8000 )
|
||||
#define KOKKOS_LAMBDA [=]__device__
|
||||
#else
|
||||
#define KOKKOS_LAMBDA [=]__host__ __device__
|
||||
#endif
|
||||
#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
|
||||
#endif
|
||||
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
|
||||
|
||||
|
||||
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
|
||||
// Cuda version 8.0 still needs the functor wrapper
|
||||
#if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
|
||||
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Language info: C++, CUDA, OPENMP */
|
||||
|
||||
|
@ -440,27 +452,16 @@
|
|||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
/* Transitional macro to change between old and new View,
|
||||
* default to use new View.
|
||||
/* Transitional macro to change between old and new View
|
||||
* are no longer supported.
|
||||
*/
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXP_VIEW )
|
||||
#if defined( KOKKOS_USING_DEPRECATED_VIEW )
|
||||
#define KOKKOS_USING_EXP_VIEW 0
|
||||
#else
|
||||
#define KOKKOS_USING_EXP_VIEW 1
|
||||
#endif
|
||||
#error "Kokkos deprecated View has been removed"
|
||||
#endif
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
#define KOKKOS_USING_EXP_VIEW 1
|
||||
#define KOKKOS_USING_EXPERIMENTAL_VIEW
|
||||
#endif
|
||||
#else /* ! KOKKOS_USING_EXP_VIEW */
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -58,9 +58,11 @@
|
|||
#endif
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
|||
|
||||
#include <OpenMP/Kokkos_OpenMPexec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -35,7 +35,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
|
@ -125,17 +125,26 @@ struct pair
|
|||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Assignment operator.
|
||||
|
||||
/// \brief Assignment operator, for volatile <tt>*this</tt>.
|
||||
///
|
||||
/// This calls the assignment operators of T1 and T2. It won't
|
||||
/// \param p [in] Input; right-hand side of the assignment.
|
||||
///
|
||||
/// This calls the assignment operators of T1 and T2. It will not
|
||||
/// compile if the assignment operators are not defined and public.
|
||||
///
|
||||
/// This operator returns \c void instead of <tt>volatile pair<T1,
|
||||
/// T2>& </tt>. See Kokkos Issue #177 for the explanation. In
|
||||
/// practice, this means that you should not chain assignments with
|
||||
/// volatile lvalues.
|
||||
template <class U, class V>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile
|
||||
void operator=(const volatile pair<U,V> &p) volatile
|
||||
{
|
||||
first = p.first;
|
||||
second = p.second;
|
||||
return *this;
|
||||
// We deliberately do not return anything here. See explanation
|
||||
// in public documentation above.
|
||||
}
|
||||
|
||||
// from std::pair<U,V>
|
||||
|
|
|
@ -57,7 +57,6 @@
|
|||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy & policy
|
|||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy & policy
|
|||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelFor(kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelFor(kpID);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -210,8 +209,8 @@ void parallel_for( const size_t work_count
|
|||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -222,8 +221,8 @@ void parallel_for( const size_t work_count
|
|||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelFor(kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelFor(kpID);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -248,405 +247,9 @@ void parallel_for( const std::string & str
|
|||
(void) str;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Parallel reduction
|
||||
*
|
||||
* Example of a parallel_reduce functor for a POD (plain old data) value type:
|
||||
* \code
|
||||
* class FunctorType { // For POD value type
|
||||
* public:
|
||||
* typedef ... execution_space ;
|
||||
* typedef <podType> value_type ;
|
||||
* void operator()( <intType> iwork , <podType> & update ) const ;
|
||||
* void init( <podType> & update ) const ;
|
||||
* void join( volatile <podType> & update ,
|
||||
* volatile const <podType> & input ) const ;
|
||||
*
|
||||
* typedef true_type has_final ;
|
||||
* void final( <podType> & update ) const ;
|
||||
* };
|
||||
* \endcode
|
||||
*
|
||||
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
|
||||
* \code
|
||||
* class FunctorType { // For array of POD value
|
||||
* public:
|
||||
* typedef ... execution_space ;
|
||||
* typedef <podType> value_type[] ;
|
||||
* void operator()( <intType> , <podType> update[] ) const ;
|
||||
* void init( <podType> update[] ) const ;
|
||||
* void join( volatile <podType> update[] ,
|
||||
* volatile const <podType> input[] ) const ;
|
||||
*
|
||||
* typedef true_type has_final ;
|
||||
* void final( <podType> update[] ) const ;
|
||||
* };
|
||||
* \endcode
|
||||
*/
|
||||
template< class ExecPolicy , class FunctorType >
|
||||
inline
|
||||
void parallel_reduce( const ExecPolicy & policy
|
||||
, const FunctorType & functor
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
|
||||
)
|
||||
{
|
||||
// typedef typename
|
||||
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
// execution_space ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
|
||||
|
||||
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
|
||||
, typename ValueTraits::value_type
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
Kokkos::View< value_type
|
||||
, HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
>
|
||||
result_view ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// integral range policy
|
||||
template< class FunctorType >
|
||||
inline
|
||||
void parallel_reduce( const size_t work_count
|
||||
, const FunctorType & functor
|
||||
, const std::string& str = ""
|
||||
)
|
||||
{
|
||||
typedef typename
|
||||
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
|
||||
execution_space ;
|
||||
|
||||
typedef RangePolicy< execution_space > policy ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
|
||||
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
|
||||
, typename ValueTraits::value_type
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
Kokkos::View< value_type
|
||||
, HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
>
|
||||
result_view ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// general policy and view ouput
|
||||
template< class ExecPolicy , class FunctorType , class ViewType >
|
||||
inline
|
||||
void parallel_reduce( const ExecPolicy & policy
|
||||
, const FunctorType & functor
|
||||
, const ViewType & result_view
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if<
|
||||
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
|
||||
#endif
|
||||
)>::type * = 0 )
|
||||
{
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// general policy and pod or array of pod output
|
||||
template< class ExecPolicy , class FunctorType >
|
||||
void parallel_reduce( const ExecPolicy & policy
|
||||
, const FunctorType & functor
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
, typename Impl::enable_if<
|
||||
( ! Impl::is_integral< ExecPolicy >::value &&
|
||||
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
|
||||
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
|
||||
)
|
||||
#else
|
||||
, typename Impl::enable_if<
|
||||
( ! Impl::is_integral< ExecPolicy >::value)
|
||||
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
|
||||
>::type result_ref
|
||||
, const std::string& str = ""
|
||||
)
|
||||
#endif
|
||||
{
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
|
||||
|
||||
// Wrap the result output request in a view to inform the implementation
|
||||
// of the type and memory space.
|
||||
|
||||
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
|
||||
, typename ValueTraits::value_type
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
Kokkos::View< value_type
|
||||
, HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
>
|
||||
result_view( ValueOps::pointer( result_ref )
|
||||
, ValueTraits::value_count( functor )
|
||||
);
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// integral range policy and view ouput
|
||||
template< class FunctorType , class ViewType >
|
||||
inline
|
||||
void parallel_reduce( const size_t work_count
|
||||
, const FunctorType & functor
|
||||
, const ViewType & result_view
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
&& ! Impl::is_same<
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
|
||||
Kokkos::Cuda>::value
|
||||
#endif
|
||||
)>::type * = 0 )
|
||||
{
|
||||
typedef typename
|
||||
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
|
||||
execution_space ;
|
||||
|
||||
typedef RangePolicy< execution_space > ExecPolicy ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// integral range policy and pod or array of pod output
|
||||
template< class FunctorType >
|
||||
inline
|
||||
void parallel_reduce( const size_t work_count
|
||||
, const FunctorType & functor
|
||||
, typename Kokkos::Impl::FunctorValueTraits<
|
||||
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
|
||||
Impl::is_integral<FunctorType>::value,
|
||||
void,FunctorType>::type
|
||||
, void >::reference_type result
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if< true
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
&& ! Impl::is_same<
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
|
||||
Kokkos::Cuda>::value
|
||||
#endif
|
||||
>::type * = 0 )
|
||||
{
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
|
||||
|
||||
typedef typename
|
||||
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
|
||||
execution_space ;
|
||||
|
||||
typedef Kokkos::RangePolicy< execution_space > policy ;
|
||||
|
||||
// Wrap the result output request in a view to inform the implementation
|
||||
// of the type and memory space.
|
||||
|
||||
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
|
||||
, typename ValueTraits::value_type
|
||||
, typename ValueTraits::pointer_type
|
||||
>::type value_type ;
|
||||
|
||||
Kokkos::View< value_type
|
||||
, HostSpace
|
||||
, Kokkos::MemoryUnmanaged
|
||||
>
|
||||
result_view( ValueOps::pointer( result )
|
||||
, ValueTraits::value_count( functor )
|
||||
);
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
#ifndef KOKKOS_HAVE_CUDA
|
||||
template< class ExecPolicy , class FunctorType , class ResultType >
|
||||
inline
|
||||
void parallel_reduce( const std::string & str
|
||||
, const ExecPolicy & policy
|
||||
, const FunctorType & functor
|
||||
, ResultType * result)
|
||||
{
|
||||
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
|
||||
Kokkos::fence();
|
||||
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
|
||||
#endif
|
||||
|
||||
parallel_reduce(policy,functor,result,str);
|
||||
|
||||
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
|
||||
Kokkos::fence();
|
||||
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
|
||||
#endif
|
||||
(void) str;
|
||||
}
|
||||
|
||||
template< class ExecPolicy , class FunctorType , class ResultType >
|
||||
inline
|
||||
void parallel_reduce( const std::string & str
|
||||
, const ExecPolicy & policy
|
||||
, const FunctorType & functor
|
||||
, ResultType & result)
|
||||
{
|
||||
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
|
||||
Kokkos::fence();
|
||||
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
|
||||
#endif
|
||||
|
||||
parallel_reduce(policy,functor,result,str);
|
||||
|
||||
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
|
||||
Kokkos::fence();
|
||||
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
|
||||
#endif
|
||||
(void) str;
|
||||
}
|
||||
|
||||
template< class ExecPolicy , class FunctorType >
|
||||
inline
|
||||
void parallel_reduce( const std::string & str
|
||||
, const ExecPolicy & policy
|
||||
, const FunctorType & functor)
|
||||
{
|
||||
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
|
||||
Kokkos::fence();
|
||||
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
|
||||
#endif
|
||||
|
||||
parallel_reduce(policy,functor,str);
|
||||
|
||||
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
|
||||
Kokkos::fence();
|
||||
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
|
||||
#endif
|
||||
(void) str;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#include <Kokkos_Parallel_Reduce.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy
|
|||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy
|
|||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelScan(kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -849,8 +452,8 @@ void parallel_scan( const size_t work_count
|
|||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -861,8 +464,8 @@ void parallel_scan( const size_t work_count
|
|||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelScan(kpID);
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -66,11 +66,15 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
mutable char * m_iter ;
|
||||
char * m_end ;
|
||||
mutable char * m_iter_L0 ;
|
||||
char * m_end_L0 ;
|
||||
mutable char * m_iter_L1 ;
|
||||
char * m_end_L1 ;
|
||||
|
||||
|
||||
mutable int m_multiplier;
|
||||
mutable int m_offset;
|
||||
mutable int m_default_level;
|
||||
|
||||
ScratchMemorySpace();
|
||||
ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
|
||||
|
@ -95,34 +99,58 @@ public:
|
|||
|
||||
template< typename IntType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void* get_shmem (const IntType& size) const {
|
||||
void* tmp = m_iter + m_offset * align (size);
|
||||
if (m_end < (m_iter += align (size) * m_multiplier)) {
|
||||
m_iter -= align (size) * m_multiplier; // put it back like it was
|
||||
#ifdef KOKKOS_HAVE_DEBUG
|
||||
// mfh 23 Jun 2015: printf call consumes 25 registers
|
||||
// in a CUDA build, so only print in debug mode. The
|
||||
// function still returns NULL if not enough memory.
|
||||
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
|
||||
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
|
||||
long(m_end-m_iter));
|
||||
#endif // KOKKOS_HAVE_DEBUG
|
||||
tmp = 0;
|
||||
void* get_shmem (const IntType& size, int level = -1) const {
|
||||
if(level == -1)
|
||||
level = m_default_level;
|
||||
if(level == 0) {
|
||||
void* tmp = m_iter_L0 + m_offset * align (size);
|
||||
if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
|
||||
m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
|
||||
#ifdef KOKKOS_HAVE_DEBUG
|
||||
// mfh 23 Jun 2015: printf call consumes 25 registers
|
||||
// in a CUDA build, so only print in debug mode. The
|
||||
// function still returns NULL if not enough memory.
|
||||
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
|
||||
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
|
||||
long(m_end_L0-m_iter_L0));
|
||||
#endif // KOKKOS_HAVE_DEBUG
|
||||
tmp = 0;
|
||||
}
|
||||
return tmp;
|
||||
} else {
|
||||
void* tmp = m_iter_L1 + m_offset * align (size);
|
||||
if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
|
||||
m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
|
||||
#ifdef KOKKOS_HAVE_DEBUG
|
||||
// mfh 23 Jun 2015: printf call consumes 25 registers
|
||||
// in a CUDA build, so only print in debug mode. The
|
||||
// function still returns NULL if not enough memory.
|
||||
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
|
||||
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
|
||||
long(m_end_L1-m_iter_L1));
|
||||
#endif // KOKKOS_HAVE_DEBUG
|
||||
tmp = 0;
|
||||
}
|
||||
return tmp;
|
||||
|
||||
}
|
||||
return tmp;
|
||||
}
|
||||
|
||||
template< typename IntType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ScratchMemorySpace( void * ptr , const IntType & size )
|
||||
: m_iter( (char *) ptr )
|
||||
, m_end( m_iter + size )
|
||||
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
|
||||
: m_iter_L0( (char *) ptr_L0 )
|
||||
, m_end_L0( m_iter_L0 + size_L0 )
|
||||
, m_iter_L1( (char *) ptr_L1 )
|
||||
, m_end_L1( m_iter_L1 + size_L1 )
|
||||
, m_multiplier( 1 )
|
||||
, m_offset( 0 )
|
||||
, m_default_level( 0 )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const {
|
||||
const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
|
||||
m_default_level = level;
|
||||
m_multiplier = multiplier;
|
||||
m_offset = offset;
|
||||
return *this;
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -50,12 +50,17 @@
|
|||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_SERIAL )
|
||||
|
||||
|
@ -142,7 +147,9 @@ public:
|
|||
|
||||
// Init the array of locks used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
static int is_initialized() { return 1 ; }
|
||||
|
@ -151,7 +158,11 @@ public:
|
|||
static int concurrency() {return 1;};
|
||||
|
||||
//! Free any resources being consumed by the device.
|
||||
static void finalize() {}
|
||||
static void finalize() {
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//! Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
|
||||
|
@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<
|
|||
{
|
||||
private:
|
||||
|
||||
size_t m_team_scratch_size ;
|
||||
size_t m_thread_scratch_size ;
|
||||
size_t m_team_scratch_size[2] ;
|
||||
size_t m_thread_scratch_size[2] ;
|
||||
int m_league_size ;
|
||||
int m_chunk_size;
|
||||
|
||||
|
@ -324,8 +335,10 @@ public:
|
|||
|
||||
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
|
||||
m_league_size = p.m_league_size;
|
||||
m_team_scratch_size = p.m_team_scratch_size;
|
||||
m_thread_scratch_size = p.m_thread_scratch_size;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
@ -348,15 +361,15 @@ public:
|
|||
|
||||
inline int team_size() const { return 1 ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; }
|
||||
inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( execution_space &
|
||||
, int league_size_request
|
||||
, int /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_league_size( league_size_request )
|
||||
, m_chunk_size ( 32 )
|
||||
{}
|
||||
|
@ -365,8 +378,8 @@ public:
|
|||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_league_size( league_size_request )
|
||||
, m_chunk_size ( 32 )
|
||||
{}
|
||||
|
@ -374,8 +387,8 @@ public:
|
|||
TeamPolicyInternal( int league_size_request
|
||||
, int /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_league_size( league_size_request )
|
||||
, m_chunk_size ( 32 )
|
||||
{}
|
||||
|
@ -383,8 +396,8 @@ public:
|
|||
TeamPolicyInternal( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_league_size( league_size_request )
|
||||
, m_chunk_size ( 32 )
|
||||
{}
|
||||
|
@ -401,26 +414,23 @@ public:
|
|||
|
||||
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size = per_team.value;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size = per_thread.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size = per_team.value;
|
||||
p.m_thread_scratch_size = per_thread.value;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
|
@ -440,7 +450,7 @@ namespace Kokkos {
|
|||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType ,
|
||||
class ParallelFor< FunctorType ,
|
||||
Kokkos::RangePolicy< Traits ... > ,
|
||||
Kokkos::Serial
|
||||
>
|
||||
|
@ -489,9 +499,10 @@ public:
|
|||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
template< class FunctorType , class ReducerType , class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Serial
|
||||
>
|
||||
{
|
||||
|
@ -499,14 +510,19 @@ private:
|
|||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
|
||||
|
@ -515,15 +531,15 @@ private:
|
|||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( i , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
|
@ -532,15 +548,15 @@ private:
|
|||
exec( pointer_type ptr ) const
|
||||
{
|
||||
const TagType t{} ;
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( t , i , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -549,25 +565,43 @@ public:
|
|||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
|
||||
( ValueTraits::value_size( m_functor ) , 0 );
|
||||
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
|
||||
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const HostViewType & arg_result_view ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< HostViewType >::value
|
||||
, "Kokkos::Serial reduce result must be a View" );
|
||||
|
||||
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
|
||||
, "Kokkos::Serial reduce result must be a View in HostSpace" );
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result )
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
|
||||
|
||||
static_assert( std::is_same< typename ViewType::memory_space
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -697,15 +731,16 @@ public:
|
|||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
{ }
|
||||
};
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
template< class FunctorType , class ReducerType , class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, ReducerType
|
||||
, Kokkos::Serial
|
||||
>
|
||||
{
|
||||
|
@ -714,30 +749,35 @@ private:
|
|||
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const int m_league ;
|
||||
const int m_shared ;
|
||||
const ReducerType m_reducer ;
|
||||
pointer_type m_result_ptr ;
|
||||
const int m_shared ;
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( Member(ileague,m_league,m_shared) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
|
@ -747,14 +787,14 @@ private:
|
|||
{
|
||||
const TagType t{} ;
|
||||
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( t , Member(ileague,m_league,m_shared) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -763,7 +803,7 @@ public:
|
|||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
|
||||
( ValueTraits::value_size( m_functor ) , m_shared );
|
||||
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
|
||||
|
||||
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
|
||||
}
|
||||
|
@ -771,12 +811,16 @@ public:
|
|||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result
|
||||
)
|
||||
, const ViewType & arg_result ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
|
||||
{
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
|
||||
|
@ -786,6 +830,21 @@ public:
|
|||
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func
|
|||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <impl/Kokkos_Serial_Task.hpp>
|
||||
|
||||
#endif // defined( KOKKOS_HAVE_SERIAL )
|
||||
#endif /* #define KOKKOS_SERIAL_HPP */
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
|
@ -47,13 +46,655 @@
|
|||
#ifndef KOKKOS_TASKPOLICY_HPP
|
||||
#define KOKKOS_TASKPOLICY_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_StaticAssert.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// If compiling with CUDA then must be using CUDA 8 or better
|
||||
// and use relocateable device code to enable the task policy.
|
||||
// nvcc relocatable device code option: --relocatable-device-code=true
|
||||
|
||||
#if ( defined( KOKKOS_COMPILER_NVCC ) )
|
||||
#if ( 8000 <= CUDA_VERSION ) && \
|
||||
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
|
||||
|
||||
#define KOKKOS_ENABLE_TASKPOLICY
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
#define KOKKOS_ENABLE_TASKPOLICY
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_TaskQueue.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
|
||||
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
|
||||
|
||||
enum TaskPriority { TaskHighPriority = 0
|
||||
, TaskRegularPriority = 1
|
||||
, TaskLowPriority = 2 };
|
||||
|
||||
template< typename Space >
|
||||
class TaskPolicy ;
|
||||
|
||||
template< typename Space >
|
||||
void wait( TaskPolicy< Space > const & );
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/*\brief Implementation data for task data management, access, and execution.
|
||||
*
|
||||
* CRTP Inheritance structure to allow static_cast from the
|
||||
* task root type and a task's FunctorType.
|
||||
*
|
||||
* TaskBase< Space , ResultType , FunctorType >
|
||||
* : TaskBase< Space , ResultType , void >
|
||||
* , FunctorType
|
||||
* { ... };
|
||||
*
|
||||
* TaskBase< Space , ResultType , void >
|
||||
* : TaskBase< Space , void , void >
|
||||
* { ... };
|
||||
*/
|
||||
template< typename Space , typename ResultType , typename FunctorType >
|
||||
class TaskBase ;
|
||||
|
||||
template< typename Space >
|
||||
class TaskExec ;
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/**
|
||||
*
|
||||
* Future< space > // value_type == void
|
||||
* Future< value > // space == Default
|
||||
* Future< value , space >
|
||||
*
|
||||
*/
|
||||
template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
|
||||
class Future {
|
||||
private:
|
||||
|
||||
template< typename > friend class TaskPolicy ;
|
||||
template< typename , typename > friend class Future ;
|
||||
template< typename , typename , typename > friend class Impl::TaskBase ;
|
||||
|
||||
enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value };
|
||||
enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value };
|
||||
enum { Arg1_is_value = ! Arg1_is_space &&
|
||||
! std::is_same< Arg1 , void >::value };
|
||||
enum { Arg2_is_value = ! Arg2_is_space &&
|
||||
! std::is_same< Arg2 , void >::value };
|
||||
|
||||
static_assert( ! ( Arg1_is_space && Arg2_is_space )
|
||||
, "Future cannot be given two spaces" );
|
||||
|
||||
static_assert( ! ( Arg1_is_value && Arg2_is_value )
|
||||
, "Future cannot be given two value types" );
|
||||
|
||||
using ValueType =
|
||||
typename std::conditional< Arg1_is_value , Arg1 ,
|
||||
typename std::conditional< Arg2_is_value , Arg2 , void
|
||||
>::type >::type ;
|
||||
|
||||
using Space =
|
||||
typename std::conditional< Arg1_is_space , Arg1 ,
|
||||
typename std::conditional< Arg2_is_space , Arg2 , void
|
||||
>::type >::type ;
|
||||
|
||||
using task_base = Impl::TaskBase< Space , ValueType , void > ;
|
||||
using queue_type = Impl::TaskQueue< Space > ;
|
||||
|
||||
task_base * m_task ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
Future( task_base * task ) : m_task(0)
|
||||
{ if ( task ) queue_type::assign( & m_task , task ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
using execution_space = typename Space::execution_space ;
|
||||
using value_type = ValueType ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool is_null() const { return 0 == m_task ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int reference_count() const
|
||||
{ return 0 != m_task ? m_task->reference_count() : 0 ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr Future() noexcept : m_task(0) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future( Future && rhs )
|
||||
: m_task( rhs.m_task ) { rhs.m_task = 0 ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future( const Future & rhs )
|
||||
: m_task(0)
|
||||
{ if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future & operator = ( Future && rhs )
|
||||
{
|
||||
if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
|
||||
m_task = rhs.m_task ;
|
||||
rhs.m_task = 0 ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future & operator = ( const Future & rhs )
|
||||
{
|
||||
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
|
||||
return *this ;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class A1 , class A2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future( Future<A1,A2> && rhs )
|
||||
: m_task( rhs.m_task )
|
||||
{
|
||||
static_assert
|
||||
( std::is_same< Space , void >::value ||
|
||||
std::is_same< Space , typename Future<A1,A2>::Space >::value
|
||||
, "Assigned Futures must have the same space" );
|
||||
|
||||
static_assert
|
||||
( std::is_same< value_type , void >::value ||
|
||||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
|
||||
, "Assigned Futures must have the same value_type" );
|
||||
|
||||
rhs.m_task = 0 ;
|
||||
}
|
||||
|
||||
template< class A1 , class A2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future( const Future<A1,A2> & rhs )
|
||||
: m_task(0)
|
||||
{
|
||||
static_assert
|
||||
( std::is_same< Space , void >::value ||
|
||||
std::is_same< Space , typename Future<A1,A2>::Space >::value
|
||||
, "Assigned Futures must have the same space" );
|
||||
|
||||
static_assert
|
||||
( std::is_same< value_type , void >::value ||
|
||||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
|
||||
, "Assigned Futures must have the same value_type" );
|
||||
|
||||
if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
|
||||
}
|
||||
|
||||
template< class A1 , class A2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future & operator = ( const Future<A1,A2> & rhs )
|
||||
{
|
||||
static_assert
|
||||
( std::is_same< Space , void >::value ||
|
||||
std::is_same< Space , typename Future<A1,A2>::Space >::value
|
||||
, "Assigned Futures must have the same space" );
|
||||
|
||||
static_assert
|
||||
( std::is_same< value_type , void >::value ||
|
||||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
|
||||
, "Assigned Futures must have the same value_type" );
|
||||
|
||||
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
|
||||
return *this ;
|
||||
}
|
||||
|
||||
template< class A1 , class A2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future & operator = ( Future<A1,A2> && rhs )
|
||||
{
|
||||
static_assert
|
||||
( std::is_same< Space , void >::value ||
|
||||
std::is_same< Space , typename Future<A1,A2>::Space >::value
|
||||
, "Assigned Futures must have the same space" );
|
||||
|
||||
static_assert
|
||||
( std::is_same< value_type , void >::value ||
|
||||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
|
||||
, "Assigned Futures must have the same value_type" );
|
||||
|
||||
if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
|
||||
m_task = rhs.m_task ;
|
||||
rhs.m_task = 0 ;
|
||||
return *this ;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename task_base::get_return_type
|
||||
get() const
|
||||
{
|
||||
if ( 0 == m_task ) {
|
||||
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
|
||||
}
|
||||
return m_task->get();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template< typename ExecSpace >
|
||||
class TaskPolicy
|
||||
{
|
||||
private:
|
||||
|
||||
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
|
||||
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
|
||||
|
||||
track_type m_track ;
|
||||
queue_type * m_queue ;
|
||||
|
||||
//----------------------------------------
|
||||
// Process optional arguments to spawn and respawn functions
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const ) {}
|
||||
|
||||
// TaskTeam or TaskSingle
|
||||
template< typename ... Options >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const task
|
||||
, TaskType const & arg
|
||||
, Options const & ... opts )
|
||||
{
|
||||
task->m_task_type = arg ;
|
||||
assign( task , opts ... );
|
||||
}
|
||||
|
||||
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
|
||||
template< typename ... Options >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const task
|
||||
, TaskPriority const & arg
|
||||
, Options const & ... opts )
|
||||
{
|
||||
task->m_priority = arg ;
|
||||
assign( task , opts ... );
|
||||
}
|
||||
|
||||
// Future for a dependence
|
||||
template< typename A1 , typename A2 , typename ... Options >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const task
|
||||
, Future< A1 , A2 > const & arg
|
||||
, Options const & ... opts )
|
||||
{
|
||||
// Assign dependence to task->m_next
|
||||
// which will be processed within subsequent call to schedule.
|
||||
// Error if the dependence is reset.
|
||||
|
||||
if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
|
||||
Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
|
||||
}
|
||||
|
||||
if ( 0 != arg.m_task ) {
|
||||
// The future may be destroyed upon returning from this call
|
||||
// so increment reference count to track this assignment.
|
||||
Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
|
||||
}
|
||||
|
||||
assign( task , opts ... );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
using execution_policy = TaskPolicy ;
|
||||
using execution_space = ExecSpace ;
|
||||
using memory_space = typename queue_type::memory_space ;
|
||||
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicy() : m_track(), m_queue(0) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicy( TaskPolicy && rhs ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicy( TaskPolicy const & rhs ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
|
||||
|
||||
TaskPolicy( memory_space const & arg_memory_space
|
||||
, unsigned const arg_memory_pool_capacity
|
||||
, unsigned const arg_memory_pool_log2_superblock = 12 )
|
||||
: m_track()
|
||||
, m_queue(0)
|
||||
{
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord
|
||||
< memory_space , typename queue_type::Destroy >
|
||||
record_type ;
|
||||
|
||||
record_type * record =
|
||||
record_type::allocate( arg_memory_space
|
||||
, "TaskQueue"
|
||||
, sizeof(queue_type)
|
||||
);
|
||||
|
||||
m_queue = new( record->data() )
|
||||
queue_type( arg_memory_space
|
||||
, arg_memory_pool_capacity
|
||||
, arg_memory_pool_log2_superblock );
|
||||
|
||||
record->m_destroy.m_queue = m_queue ;
|
||||
|
||||
m_track.assign_allocated_record_to_uninitialized( record );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
/**\brief Allocation size for a spawned task */
|
||||
template< typename FunctorType >
|
||||
KOKKOS_FUNCTION
|
||||
size_t spawn_allocation_size() const
|
||||
{
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType > ;
|
||||
|
||||
return m_queue->allocate_block_size( sizeof(task_type) );
|
||||
}
|
||||
|
||||
/**\brief Allocation size for a when_all aggregate */
|
||||
KOKKOS_FUNCTION
|
||||
size_t when_all_allocation_size( int narg ) const
|
||||
{
|
||||
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
|
||||
|
||||
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
/**\brief A task spawns a task with options
|
||||
*
|
||||
* 1) High, Normal, or Low priority
|
||||
* 2) With or without dependence
|
||||
* 3) Team or Serial
|
||||
*/
|
||||
template< typename FunctorType , typename ... Options >
|
||||
KOKKOS_FUNCTION
|
||||
Future< typename FunctorType::value_type , ExecSpace >
|
||||
task_spawn( FunctorType const & arg_functor
|
||||
, Options const & ... arg_options
|
||||
) const
|
||||
{
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using future_type = Future< value_type , execution_space > ;
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
//----------------------------------------
|
||||
// Give single-thread back-ends an opportunity to clear
|
||||
// queue of ready tasks before allocating a new task
|
||||
|
||||
m_queue->iff_single_thread_recursive_execute();
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
future_type f ;
|
||||
|
||||
// Allocate task from memory pool
|
||||
f.m_task =
|
||||
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
// Placement new construction
|
||||
new ( f.m_task ) task_type( arg_functor );
|
||||
|
||||
// Reference count starts at two
|
||||
// +1 for matching decrement when task is complete
|
||||
// +1 for future
|
||||
f.m_task->m_queue = m_queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = sizeof(task_type);
|
||||
|
||||
assign( f.m_task , arg_options... );
|
||||
|
||||
// Spawning from within the execution space so the
|
||||
// apply function pointer is guaranteed to be valid
|
||||
f.m_task->m_apply = task_type::apply ;
|
||||
|
||||
m_queue->schedule( f.m_task );
|
||||
// this task may be updated or executed at any moment
|
||||
}
|
||||
|
||||
return f ;
|
||||
}
|
||||
|
||||
/**\brief The host process spawns a task with options
|
||||
*
|
||||
* 1) High, Normal, or Low priority
|
||||
* 2) With or without dependence
|
||||
* 3) Team or Serial
|
||||
*/
|
||||
template< typename FunctorType , typename ... Options >
|
||||
inline
|
||||
Future< typename FunctorType::value_type , ExecSpace >
|
||||
host_spawn( FunctorType const & arg_functor
|
||||
, Options const & ... arg_options
|
||||
) const
|
||||
{
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using future_type = Future< value_type , execution_space > ;
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
future_type f ;
|
||||
|
||||
// Allocate task from memory pool
|
||||
f.m_task =
|
||||
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
// Placement new construction
|
||||
new( f.m_task ) task_type( arg_functor );
|
||||
|
||||
// Reference count starts at two:
|
||||
// +1 to match decrement when task completes
|
||||
// +1 for the future
|
||||
f.m_task->m_queue = m_queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = sizeof(task_type);
|
||||
|
||||
assign( f.m_task , arg_options... );
|
||||
|
||||
// Potentially spawning outside execution space so the
|
||||
// apply function pointer must be obtained from execution space.
|
||||
// Required for Cuda execution space function pointer.
|
||||
queue_type::specialization::template
|
||||
proc_set_apply< FunctorType >( & f.m_task->m_apply );
|
||||
|
||||
m_queue->schedule( f.m_task );
|
||||
}
|
||||
return f ;
|
||||
}
|
||||
|
||||
/**\brief Return a future that is complete
|
||||
* when all input futures are complete.
|
||||
*/
|
||||
template< typename A1 , typename A2 >
|
||||
KOKKOS_FUNCTION
|
||||
Future< ExecSpace >
|
||||
when_all( int narg , Future< A1 , A2 > const * const arg ) const
|
||||
{
|
||||
static_assert
|
||||
( std::is_same< execution_space
|
||||
, typename Future< A1 , A2 >::execution_space
|
||||
>::value
|
||||
, "Future must have same execution space" );
|
||||
|
||||
using future_type = Future< ExecSpace > ;
|
||||
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
|
||||
|
||||
future_type f ;
|
||||
|
||||
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
|
||||
|
||||
f.m_task =
|
||||
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
new( f.m_task ) task_base();
|
||||
|
||||
// Reference count starts at two:
|
||||
// +1 to match decrement when task completes
|
||||
// +1 for the future
|
||||
f.m_task->m_queue = m_queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = size ;
|
||||
f.m_task->m_dep_count = narg ;
|
||||
f.m_task->m_task_type = task_base::Aggregate ;
|
||||
|
||||
task_base ** const dep = f.m_task->aggregate_dependences();
|
||||
|
||||
// Assign dependences to increment their reference count
|
||||
// The futures may be destroyed upon returning from this call
|
||||
// so increment reference count to track this assignment.
|
||||
|
||||
for ( int i = 0 ; i < narg ; ++i ) {
|
||||
task_base * const t = dep[i] = arg[i].m_task ;
|
||||
if ( 0 != t ) {
|
||||
Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
|
||||
}
|
||||
}
|
||||
|
||||
m_queue->schedule( f.m_task );
|
||||
// this when_all may be processed at any moment
|
||||
}
|
||||
|
||||
return f ;
|
||||
}
|
||||
|
||||
/**\brief An executing task respawns itself with options
|
||||
*
|
||||
* 1) High, Normal, or Low priority
|
||||
* 2) With or without dependence
|
||||
*/
|
||||
template< class FunctorType , typename ... Options >
|
||||
KOKKOS_FUNCTION
|
||||
void respawn( FunctorType * task_self
|
||||
, Options const & ... arg_options ) const
|
||||
{
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
task_base * const zero = (task_base *) 0 ;
|
||||
task_base * const lock = (task_base *) task_base::LockTag ;
|
||||
task_type * const task = static_cast< task_type * >( task_self );
|
||||
|
||||
// Precondition:
|
||||
// task is in Executing state
|
||||
// therefore m_next == LockTag
|
||||
//
|
||||
// Change to m_next == 0 for no dependence
|
||||
|
||||
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
|
||||
Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
|
||||
}
|
||||
|
||||
assign( task , arg_options... );
|
||||
|
||||
// Postcondition:
|
||||
// task is in Executing-Respawn state
|
||||
// therefore m_next == dependece or 0
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< typename S >
|
||||
friend
|
||||
void Kokkos::wait( Kokkos::TaskPolicy< S > const & );
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline
|
||||
int allocation_capacity() const noexcept
|
||||
{ return m_queue->m_memory.get_mem_size(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocated_task_count() const noexcept
|
||||
{ return m_queue->m_count_alloc ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocated_task_count_max() const noexcept
|
||||
{ return m_queue->m_max_alloc ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
long allocated_task_count_accum() const noexcept
|
||||
{ return m_queue->m_accum_alloc ; }
|
||||
|
||||
};
|
||||
|
||||
template< typename ExecSpace >
|
||||
inline
|
||||
void wait( TaskPolicy< ExecSpace > const & policy )
|
||||
{ policy.m_queue->execute(); }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & );
|
|||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #define KOKKOS_TASKPOLICY_HPP */
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace
|
|||
#include <Threads/Kokkos_ThreadsTeam.hpp>
|
||||
#include <Threads/Kokkos_Threads_Parallel.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -178,9 +178,10 @@ public:
|
|||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ...>
|
||||
, ReducerType
|
||||
, Kokkos::OpenMP
|
||||
>
|
||||
{
|
||||
|
@ -192,15 +193,21 @@ private:
|
|||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
// Static Assert WorkTag void if ReducerType not InvalidType
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
|
@ -252,7 +259,7 @@ public:
|
|||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
|
@ -260,7 +267,7 @@ public:
|
|||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
|
||||
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
|
@ -269,13 +276,13 @@ public:
|
|||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( m_functor );
|
||||
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
|
@ -289,7 +296,7 @@ public:
|
|||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
|
@ -302,7 +309,7 @@ public:
|
|||
|
||||
long work_index = exec.get_work_index();
|
||||
|
||||
reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() );
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
|
||||
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
|
||||
|
@ -319,13 +326,13 @@ public:
|
|||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( m_functor );
|
||||
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
|
@ -337,18 +344,35 @@ public:
|
|||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ViewType & arg_result_view )
|
||||
, const ViewType & arg_result_view
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
|
||||
|
||||
static_assert( std::is_same< typename ViewType::memory_space
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
@ -568,13 +592,13 @@ public:
|
|||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
|
||||
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
|
||||
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
|
||||
( m_functor
|
||||
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
|
||||
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
@ -584,14 +608,15 @@ public:
|
|||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
template< class FunctorType , class ReducerType, class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, ReducerType
|
||||
, Kokkos::OpenMP
|
||||
>
|
||||
{
|
||||
|
@ -602,15 +627,19 @@ private:
|
|||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
|
@ -644,7 +673,7 @@ public:
|
|||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
|
@ -652,8 +681,8 @@ public:
|
|||
|
||||
ParallelReduce::template exec_team< WorkTag >
|
||||
( m_functor
|
||||
, Member( exec , m_policy , m_shmem_size )
|
||||
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
|
||||
, Member( exec , m_policy , m_shmem_size, 0 )
|
||||
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
|
@ -665,13 +694,13 @@ public:
|
|||
max_active_threads = m_policy.league_size()* m_policy.team_size();
|
||||
|
||||
for ( int i = 1 ; i < max_active_threads ; ++i ) {
|
||||
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( m_functor );
|
||||
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
|
@ -682,12 +711,33 @@ public:
|
|||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result )
|
||||
const ViewType & arg_result ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
|
|
@ -0,0 +1,329 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::OpenMP > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
TaskExec< Kokkos::OpenMP >::
|
||||
TaskExec()
|
||||
: m_self_exec( 0 )
|
||||
, m_team_exec( 0 )
|
||||
, m_sync_mask( 0 )
|
||||
, m_sync_value( 0 )
|
||||
, m_sync_step( 0 )
|
||||
, m_group_rank( 0 )
|
||||
, m_team_rank( 0 )
|
||||
, m_team_size( 1 )
|
||||
{
|
||||
}
|
||||
|
||||
TaskExec< Kokkos::OpenMP >::
|
||||
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
|
||||
: m_self_exec( & arg_exec )
|
||||
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
|
||||
, m_sync_mask( 0 )
|
||||
, m_sync_value( 0 )
|
||||
, m_sync_step( 0 )
|
||||
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
|
||||
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
|
||||
, m_team_size( arg_team_size )
|
||||
{
|
||||
// This team spans
|
||||
// m_self_exec->pool_rev( team_size * group_rank )
|
||||
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
|
||||
|
||||
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
|
||||
|
||||
sync[0] = int64_t(0) ;
|
||||
sync[1] = int64_t(0) ;
|
||||
|
||||
for ( int i = 0 ; i < m_team_size ; ++i ) {
|
||||
m_sync_value |= int64_t(1) << (8*i);
|
||||
m_sync_mask |= int64_t(3) << (8*i);
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
|
||||
{
|
||||
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
|
||||
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
|
||||
}
|
||||
|
||||
// Use team shared memory to synchronize.
|
||||
// Alternate memory locations between barriers to avoid a sequence
|
||||
// of barriers overtaking one another.
|
||||
|
||||
int64_t volatile * const sync =
|
||||
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
|
||||
|
||||
// This team member sets one byte within the sync variable
|
||||
int8_t volatile * const sync_self =
|
||||
((int8_t *) sync) + m_team_rank ;
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
|
||||
, m_group_rank
|
||||
, m_team_rank
|
||||
, m_sync_step
|
||||
, m_sync_value
|
||||
, *sync
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
|
||||
|
||||
while ( m_sync_value != *sync ); // wait for team to arrive
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
|
||||
, m_group_rank
|
||||
, m_team_rank
|
||||
, m_sync_step
|
||||
, m_sync_value
|
||||
, *sync
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
++m_sync_step ;
|
||||
|
||||
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
|
||||
m_sync_value ^= m_sync_mask ;
|
||||
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::OpenMP >::execute
|
||||
( TaskQueue< Kokkos::OpenMP > * const queue )
|
||||
{
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using PoolExec = Kokkos::Impl::OpenMPexec ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
// Required: team_size <= 8
|
||||
|
||||
const int team_size = PoolExec::pool_size(2); // Threads per core
|
||||
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
|
||||
|
||||
if ( 8 < team_size ) {
|
||||
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
PoolExec & self = *PoolExec::get_thread_omp();
|
||||
|
||||
Member single_exec ;
|
||||
Member team_exec( self , team_size );
|
||||
|
||||
// Team shared memory
|
||||
task_root_type * volatile * const task_shared =
|
||||
(task_root_type **) team_exec.m_team_exec->scratch_thread();
|
||||
|
||||
// Barrier across entire OpenMP thread pool to insure initialization
|
||||
#pragma omp barrier
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
do {
|
||||
|
||||
task_root_type * task = 0 ;
|
||||
|
||||
// Each team lead attempts to acquire either a thread team task
|
||||
// or a single thread task for the team.
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
|
||||
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Team lead broadcast acquired task to team members:
|
||||
|
||||
if ( 1 < team_exec.team_size() ) {
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
|
||||
|
||||
// Fence to be sure task_shared is stored before the barrier
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// Whole team waits for every team member to reach this statement
|
||||
team_exec.team_barrier();
|
||||
|
||||
// Fence to be sure task_shared is stored
|
||||
Kokkos::memory_fence();
|
||||
|
||||
task = *task_shared ;
|
||||
}
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
|
||||
, team_exec.m_group_rank
|
||||
, team_exec.m_team_rank
|
||||
, uintptr_t(task_shared)
|
||||
, uintptr_t(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
if ( 0 == task ) break ; // 0 == m_ready_count
|
||||
|
||||
if ( end == task ) {
|
||||
// All team members wait for whole team to reach this statement.
|
||||
// Is necessary to prevent task_shared from being updated
|
||||
// before it is read by all threads.
|
||||
team_exec.team_barrier();
|
||||
}
|
||||
else if ( task_root_type::TaskTeam == task->m_task_type ) {
|
||||
// Thread Team Task
|
||||
(*task->m_apply)( task , & team_exec );
|
||||
|
||||
// The m_apply function performs a barrier
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
// team member #0 completes the task, which may delete the task
|
||||
queue->complete( task );
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Single Thread Task
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
}
|
||||
|
||||
// All team members wait for whole team to reach this statement.
|
||||
// Not necessary to complete the task.
|
||||
// Is necessary to prevent task_shared from being updated
|
||||
// before it is read by all threads.
|
||||
team_exec.team_barrier();
|
||||
}
|
||||
} while(1);
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
iff_single_thread_recursive_execute
|
||||
( TaskQueue< Kokkos::OpenMP > * const queue )
|
||||
{
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
|
||||
if ( 1 == omp_get_num_threads() ) {
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec ;
|
||||
|
||||
task_root_type * task = end ;
|
||||
|
||||
do {
|
||||
|
||||
task = end ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
if ( end == task ) break ;
|
||||
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
|
||||
} while(1);
|
||||
}
|
||||
}
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
|
@ -0,0 +1,356 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
#define KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskQueueSpecialization< Kokkos::OpenMP >
|
||||
{
|
||||
public:
|
||||
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
|
||||
// Must specify memory space
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const );
|
||||
|
||||
// Must provide task queue execution function
|
||||
static void execute( queue_type * const );
|
||||
|
||||
// Must provide mechanism to set function pointer in
|
||||
// execution space from the host process.
|
||||
template< typename FunctorType >
|
||||
static
|
||||
void proc_set_apply( task_base_type::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< Kokkos::OpenMP
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> ;
|
||||
*ptr = TaskType::apply ;
|
||||
}
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::OpenMP > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<>
|
||||
class TaskExec< Kokkos::OpenMP >
|
||||
{
|
||||
private:
|
||||
|
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
|
||||
using PoolExec = Kokkos::Impl::OpenMPexec ;
|
||||
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
|
||||
|
||||
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
|
||||
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
|
||||
int64_t m_sync_mask ;
|
||||
int64_t mutable m_sync_value ;
|
||||
int mutable m_sync_step ;
|
||||
int m_group_rank ; ///< Which "team" subset of thread pool
|
||||
int m_team_rank ; ///< Which thread within a team
|
||||
int m_team_size ;
|
||||
|
||||
TaskExec();
|
||||
TaskExec( PoolExec & arg_exec , int arg_team_size );
|
||||
|
||||
void team_barrier_impl() const ;
|
||||
|
||||
public:
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void * team_shared() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
|
||||
|
||||
int team_shared_size() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
|
||||
|
||||
/**\brief Whole team enters this function call
|
||||
* before any teeam member returns from
|
||||
* this function call.
|
||||
*/
|
||||
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_rank() const { return m_team_rank ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_size() const { return m_team_size ; }
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
|
||||
TeamThreadRange
|
||||
( Impl::TaskExec< Kokkos::OpenMP > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >
|
||||
TeamThreadRange
|
||||
( Impl:: TaskExec< Kokkos::OpenMP > & thread
|
||||
, const iType & start
|
||||
, const iType & end )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >(thread,start,end);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.
|
||||
*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
)
|
||||
{
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iType, class Lambda, typename ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
, ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
shared[0] += shared[i];
|
||||
}
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
join(shared[0], shared[i]);
|
||||
}
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
template< typename ValueType, typename iType, class Lambda >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, local_total;
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
int team_size = loop_boundaries.thread.team_size();
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
|
||||
// Intra-member scan
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
|
||||
shared[team_rank] = accum;
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// Member 0 do scan on accumulated totals
|
||||
if (team_rank == 0) {
|
||||
for( iType i = 1; i < team_size; i+=1) {
|
||||
shared[i] += shared[i-1];
|
||||
}
|
||||
accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// Inter-member scan adding in accumulated totals
|
||||
if (team_rank != 0) { accum = shared[team_rank-1]; }
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
|
||||
|
|
@ -49,6 +49,7 @@
|
|||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <iostream>
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#ifdef KOKKOS_HAVE_OPENMP
|
||||
|
||||
|
@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
|||
|
||||
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
OpenMPexec::Pool OpenMPexec::m_pool;
|
||||
|
||||
#else
|
||||
|
||||
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
#endif
|
||||
|
||||
void OpenMPexec::verify_is_process( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
|
@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch()
|
|||
#pragma omp parallel
|
||||
{
|
||||
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
if ( m_pool[ rank_rev ] ) {
|
||||
Record * const r = Record::get_record( m_pool[ rank_rev ] );
|
||||
m_pool[ rank_rev ] = 0 ;
|
||||
Record::decrement( r );
|
||||
}
|
||||
#else
|
||||
m_pool.at(rank_rev).clear();
|
||||
#endif
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
|
|||
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
|
||||
const int rank = pool_size - ( rank_rev + 1 );
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace()
|
||||
|
@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
|
|||
|
||||
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
|
||||
|
||||
#else
|
||||
|
||||
#pragma omp critical
|
||||
{
|
||||
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count ,
|
|||
}
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -350,6 +332,10 @@ void OpenMP::finalize()
|
|||
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
|
||||
hwloc::unbind_this_thread();
|
||||
}
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -46,7 +46,6 @@
|
|||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <iostream>
|
||||
|
@ -63,38 +62,10 @@ public:
|
|||
|
||||
enum { MAX_THREAD_COUNT = 4096 };
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
struct Pool
|
||||
{
|
||||
Pool() : m_trackers() {}
|
||||
|
||||
AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
|
||||
|
||||
OpenMPexec * operator[](int i)
|
||||
{
|
||||
return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
|
||||
}
|
||||
|
||||
AllocationTracker & at(int i)
|
||||
{
|
||||
return m_trackers[i];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
|
||||
static Pool m_pool; // Indexed by: m_pool_rank_rev
|
||||
|
||||
#else
|
||||
|
||||
private:
|
||||
|
||||
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
|
||||
|
||||
#endif
|
||||
|
||||
static int m_pool_topo[ 4 ];
|
||||
static int m_map_rank[ MAX_THREAD_COUNT ];
|
||||
|
||||
|
@ -145,6 +116,12 @@ public:
|
|||
|
||||
inline long team_work_index() const { return m_team_work_index ; }
|
||||
|
||||
inline int scratch_reduce_size() const
|
||||
{ return m_scratch_reduce_end - m_scratch_exec_end ; }
|
||||
|
||||
inline int scratch_thread_size() const
|
||||
{ return m_scratch_thread_end - m_scratch_reduce_end ; }
|
||||
|
||||
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
|
||||
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
|
||||
|
||||
|
@ -157,15 +134,15 @@ public:
|
|||
|
||||
~OpenMPexec() {}
|
||||
|
||||
OpenMPexec( const int poolRank
|
||||
, const int scratch_exec_size
|
||||
, const int scratch_reduce_size
|
||||
, const int scratch_thread_size )
|
||||
: m_pool_rank( poolRank )
|
||||
, m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
|
||||
, m_scratch_exec_end( scratch_exec_size )
|
||||
, m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
|
||||
, m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
|
||||
OpenMPexec( const int arg_poolRank
|
||||
, const int arg_scratch_exec_size
|
||||
, const int arg_scratch_reduce_size
|
||||
, const int arg_scratch_thread_size )
|
||||
: m_pool_rank( arg_poolRank )
|
||||
, m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
|
||||
, m_scratch_exec_end( arg_scratch_exec_size )
|
||||
, m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
|
||||
, m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
|
||||
, m_barrier_state(0)
|
||||
{}
|
||||
|
||||
|
@ -330,7 +307,7 @@ public:
|
|||
|
||||
Impl::OpenMPexec & m_exec ;
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_shmem ;
|
||||
int m_team_scratch_size[2] ;
|
||||
int m_team_base_rev ;
|
||||
int m_team_rank_rev ;
|
||||
int m_team_rank ;
|
||||
|
@ -378,15 +355,15 @@ public:
|
|||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space& team_shmem() const
|
||||
{ return m_team_shared.set_team_thread_mode(1,0) ; }
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space& team_scratch(int) const
|
||||
{ return m_team_shared.set_team_thread_mode(1,0) ; }
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space& thread_scratch(int) const
|
||||
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
|
||||
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
|
||||
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
|
||||
|
@ -568,11 +545,12 @@ public:
|
|||
inline
|
||||
OpenMPexecTeamMember( Impl::OpenMPexec & exec
|
||||
, const TeamPolicyInternal< OpenMP, Properties ...> & team
|
||||
, const int shmem_size
|
||||
, const int shmem_size_L1
|
||||
, const int shmem_size_L2
|
||||
)
|
||||
: m_exec( exec )
|
||||
, m_team_shared(0,0)
|
||||
, m_team_shmem( shmem_size )
|
||||
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
|
||||
, m_team_base_rev(0)
|
||||
, m_team_rank_rev(0)
|
||||
, m_team_rank(0)
|
||||
|
@ -580,7 +558,7 @@ public:
|
|||
, m_league_rank(0)
|
||||
, m_league_end(0)
|
||||
, m_league_size( team.league_size() )
|
||||
, m_chunk_size( team.chunk_size() )
|
||||
, m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
|
||||
, m_league_chunk_end(0)
|
||||
, m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
|
||||
, m_team_alloc( team.team_alloc())
|
||||
|
@ -589,10 +567,9 @@ public:
|
|||
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
|
||||
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
|
||||
const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc();
|
||||
const int chunk_size = team.chunk_size()>0?team.chunk_size():team.team_iter();
|
||||
const int chunks_per_team = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
|
||||
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
|
||||
int league_iter_begin = league_iter_end - chunks_per_team * chunk_size;
|
||||
const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
|
||||
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
|
||||
int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size;
|
||||
if (league_iter_begin < 0) league_iter_begin = 0;
|
||||
if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
|
||||
|
||||
|
@ -611,7 +588,9 @@ public:
|
|||
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
|
||||
m_league_end = league_iter_end ;
|
||||
m_league_rank = league_iter_begin ;
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
|
||||
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
|
||||
0 );
|
||||
}
|
||||
|
||||
if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
|
||||
|
@ -627,10 +606,13 @@ public:
|
|||
|
||||
void next_static()
|
||||
{
|
||||
if ( ++m_league_rank < m_league_end ) {
|
||||
if ( m_league_rank < m_league_end ) {
|
||||
team_barrier();
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
|
||||
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
|
||||
0);
|
||||
}
|
||||
m_league_rank++;
|
||||
}
|
||||
|
||||
bool valid_dynamic() {
|
||||
|
@ -661,10 +643,13 @@ public:
|
|||
if(m_invalid_thread)
|
||||
return;
|
||||
|
||||
team_barrier();
|
||||
if ( ++m_league_rank < m_league_chunk_end ) {
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
|
||||
if ( m_league_rank < m_league_chunk_end ) {
|
||||
team_barrier();
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
|
||||
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
|
||||
0);
|
||||
}
|
||||
m_league_rank++;
|
||||
}
|
||||
|
||||
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
|
||||
|
@ -687,8 +672,10 @@ public:
|
|||
m_team_size = p.m_team_size;
|
||||
m_team_alloc = p.m_team_alloc;
|
||||
m_team_iter = p.m_team_iter;
|
||||
m_team_scratch_size = p.m_team_scratch_size;
|
||||
m_thread_scratch_size = p.m_thread_scratch_size;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
@ -719,8 +706,8 @@ private:
|
|||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_team_scratch_size;
|
||||
size_t m_thread_scratch_size;
|
||||
size_t m_team_scratch_size[2];
|
||||
size_t m_thread_scratch_size[2];
|
||||
|
||||
int m_chunk_size;
|
||||
|
||||
|
@ -753,15 +740,19 @@ public:
|
|||
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
|
||||
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
|
||||
if(team_size_ < 0)
|
||||
team_size_ = m_team_size;
|
||||
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
|
||||
}
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
|
@ -769,24 +760,24 @@ public:
|
|||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1)
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
|
||||
|
||||
|
@ -803,24 +794,21 @@ public:
|
|||
}
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size = per_team.value;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size = per_thread.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size = per_team.value;
|
||||
p.m_thread_scratch_size = per_thread.value;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
|
|
|
@ -104,7 +104,7 @@ namespace Kokkos {
|
|||
|
||||
int Qthread::is_initialized()
|
||||
{
|
||||
Impl::s_number_workers != 0 ;
|
||||
return Impl::s_number_workers != 0 ;
|
||||
}
|
||||
|
||||
int Qthread::concurrency()
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -113,7 +113,7 @@ public:
|
|||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
@ -136,7 +136,7 @@ public:
|
|||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
@ -145,11 +145,13 @@ public:
|
|||
|
||||
//----------------------------------------
|
||||
/** Reduce across all workers participating in the 'exec_all' */
|
||||
template< class FunctorType , class ArgTag >
|
||||
template< class FunctorType , class ReducerType , class ArgTag >
|
||||
inline
|
||||
void exec_all_reduce( const FunctorType & func ) const
|
||||
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
|
||||
{
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
|
||||
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
|
@ -160,14 +162,14 @@ public:
|
|||
|
||||
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
|
||||
|
||||
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
|
||||
ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
@ -197,7 +199,7 @@ public:
|
|||
}
|
||||
else {
|
||||
// Root thread scans across values before releasing threads
|
||||
// Worker data is in reverse order, so m_worker_base[0] is the
|
||||
// Worker data is in reverse order, so m_worker_base[0] is the
|
||||
// highest ranking thread.
|
||||
|
||||
// Copy from lower ranking to higher ranking worker.
|
||||
|
@ -216,7 +218,7 @@ public:
|
|||
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
@ -349,7 +351,7 @@ public:
|
|||
}
|
||||
else {
|
||||
// Root thread scans across values before releasing threads
|
||||
// Worker data is in reverse order, so m_shepherd_base[0] is the
|
||||
// Worker data is in reverse order, so m_shepherd_base[0] is the
|
||||
// highest ranking thread.
|
||||
|
||||
// Copy from lower ranking to higher ranking worker.
|
||||
|
@ -371,7 +373,7 @@ public:
|
|||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -130,9 +130,10 @@ public:
|
|||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
template< class FunctorType , class ReducerType , class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Qthread
|
||||
>
|
||||
{
|
||||
|
@ -141,18 +142,24 @@ private:
|
|||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
// Static Assert WorkTag void if ReducerType not InvalidType
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
|
@ -187,9 +194,10 @@ private:
|
|||
|
||||
ParallelReduce::template exec_range< WorkTag >(
|
||||
self.m_functor, range.begin(), range.end(),
|
||||
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
|
||||
ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
|
||||
, exec.exec_all_reduce_value() ) );
|
||||
|
||||
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor );
|
||||
exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -197,26 +205,39 @@ public:
|
|||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
|
||||
|
||||
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result_view )
|
||||
, const ViewType & arg_result_view
|
||||
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type< ReducerType >::value
|
||||
, void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
{ }
|
||||
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{ }
|
||||
};
|
||||
|
||||
|
@ -291,10 +312,12 @@ public:
|
|||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
template< class FunctorType , class ReducerType , class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, TeamPolicy< Properties... >
|
||||
, Kokkos::Qthread >
|
||||
, ReducerType
|
||||
, Kokkos::Qthread
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
|
@ -303,14 +326,18 @@ private:
|
|||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
|
@ -345,9 +372,10 @@ private:
|
|||
ParallelReduce::template exec_team< WorkTag >
|
||||
( self.m_functor
|
||||
, Member( exec , self.m_policy )
|
||||
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
|
||||
, ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
|
||||
, exec.exec_all_reduce_value() ) );
|
||||
|
||||
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -356,29 +384,43 @@ public:
|
|||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch
|
||||
( /* reduction memory */ ValueTraits::value_size( m_functor )
|
||||
( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
|
||||
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
|
||||
|
||||
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result )
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result
|
||||
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type< ReducerType >::value
|
||||
, void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
{ }
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{ }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -395,8 +437,8 @@ private:
|
|||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
|
|
@ -58,6 +58,8 @@
|
|||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -120,13 +122,13 @@ Task::~TaskMember()
|
|||
}
|
||||
|
||||
|
||||
Task::TaskMember( const function_verify_type arg_verify
|
||||
, const function_dealloc_type arg_dealloc
|
||||
, const function_apply_single_type arg_apply_single
|
||||
, const function_apply_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
Task::TaskMember( const function_verify_type arg_verify
|
||||
, const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
)
|
||||
: m_dealloc( arg_dealloc )
|
||||
, m_verify( arg_verify )
|
||||
|
@ -144,12 +146,12 @@ Task::TaskMember( const function_verify_type arg_verify
|
|||
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
|
||||
}
|
||||
|
||||
Task::TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_apply_single_type arg_apply_single
|
||||
, const function_apply_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
Task::TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
)
|
||||
: m_dealloc( arg_dealloc )
|
||||
, m_verify( & Task::verify_type<void> )
|
||||
|
@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg )
|
|||
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
|
||||
);
|
||||
|
||||
// It is a single thread's responsibility to close out
|
||||
// this task's execution.
|
||||
bool close_out = false ;
|
||||
|
||||
if ( task->m_apply_team && ! task->m_apply_single ) {
|
||||
const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
|
||||
Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
|
||||
|
||||
// Initialize team size and rank with shephered info
|
||||
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
|
||||
|
@ -344,7 +342,7 @@ fflush(stdout);
|
|||
if ( member.team_rank() == 0 ) task->closeout();
|
||||
member.team_barrier();
|
||||
}
|
||||
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) {
|
||||
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
|
||||
// Team hard-wired to one, no cloning
|
||||
Kokkos::Impl::QthreadTeamPolicyMember member ;
|
||||
(*task->m_apply_team)( task , member );
|
||||
|
@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
|
|||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
|
||||
|
||||
|
|
|
@ -69,6 +69,8 @@
|
|||
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -80,24 +82,24 @@ class TaskMember< Kokkos::Qthread , void , void >
|
|||
{
|
||||
public:
|
||||
|
||||
typedef void (* function_apply_single_type) ( TaskMember * );
|
||||
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
|
||||
typedef void (* function_dealloc_type)( TaskMember * );
|
||||
typedef TaskMember * (* function_verify_type) ( TaskMember * );
|
||||
typedef void (* function_single_type) ( TaskMember * );
|
||||
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
|
||||
typedef void (* function_dealloc_type)( TaskMember * );
|
||||
|
||||
private:
|
||||
|
||||
const function_dealloc_type m_dealloc ; ///< Deallocation
|
||||
const function_verify_type m_verify ; ///< Result type verification
|
||||
const function_apply_single_type m_apply_single ; ///< Apply function
|
||||
const function_apply_team_type m_apply_team ; ///< Apply function
|
||||
int volatile * const m_active_count ; ///< Count of active tasks on this policy
|
||||
aligned_t m_qfeb ; ///< Qthread full/empty bit
|
||||
TaskMember ** const m_dep ; ///< Dependences
|
||||
const int m_dep_capacity ; ///< Capacity of dependences
|
||||
int m_dep_size ; ///< Actual count of dependences
|
||||
int m_ref_count ; ///< Reference count
|
||||
int m_state ; ///< State of the task
|
||||
const function_dealloc_type m_dealloc ; ///< Deallocation
|
||||
const function_verify_type m_verify ; ///< Result type verification
|
||||
const function_single_type m_apply_single ; ///< Apply function
|
||||
const function_team_type m_apply_team ; ///< Apply function
|
||||
int volatile * const m_active_count ; ///< Count of active tasks on this policy
|
||||
aligned_t m_qfeb ; ///< Qthread full/empty bit
|
||||
TaskMember ** const m_dep ; ///< Dependences
|
||||
const int m_dep_capacity ; ///< Capacity of dependences
|
||||
int m_dep_size ; ///< Actual count of dependences
|
||||
int m_ref_count ; ///< Reference count
|
||||
int m_state ; ///< State of the task
|
||||
|
||||
TaskMember() /* = delete */ ;
|
||||
TaskMember( const TaskMember & ) /* = delete */ ;
|
||||
|
@ -128,22 +130,22 @@ protected :
|
|||
~TaskMember();
|
||||
|
||||
// Used by TaskMember< Qthread , ResultType , void >
|
||||
TaskMember( const function_verify_type arg_verify
|
||||
, const function_dealloc_type arg_dealloc
|
||||
, const function_apply_single_type arg_apply_single
|
||||
, const function_apply_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
TaskMember( const function_verify_type arg_verify
|
||||
, const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
);
|
||||
|
||||
// Used for TaskMember< Qthread , void , void >
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_apply_single_type arg_apply_single
|
||||
, const function_apply_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
);
|
||||
|
||||
public:
|
||||
|
@ -221,7 +223,7 @@ public:
|
|||
typedef typename DerivedTaskType::functor_type functor_type ;
|
||||
typedef typename functor_type::value_type value_type ;
|
||||
|
||||
const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 );
|
||||
const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
|
||||
|
||||
DerivedTaskType * const task =
|
||||
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
|
||||
|
@ -379,16 +381,16 @@ protected:
|
|||
|
||||
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
|
||||
typedef task_root_type::function_dealloc_type function_dealloc_type ;
|
||||
typedef task_root_type::function_apply_single_type function_apply_single_type ;
|
||||
typedef task_root_type::function_apply_team_type function_apply_team_type ;
|
||||
typedef task_root_type::function_single_type function_single_type ;
|
||||
typedef task_root_type::function_team_type function_team_type ;
|
||||
|
||||
inline
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_apply_single_type arg_apply_single
|
||||
, const function_apply_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
)
|
||||
: task_root_type( & task_root_type::template verify_type< ResultType >
|
||||
, arg_dealloc
|
||||
|
@ -413,17 +415,17 @@ public:
|
|||
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
|
||||
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
|
||||
typedef task_root_type::function_dealloc_type function_dealloc_type ;
|
||||
typedef task_root_type::function_apply_single_type function_apply_single_type ;
|
||||
typedef task_root_type::function_apply_team_type function_apply_team_type ;
|
||||
typedef task_root_type::function_single_type function_single_type ;
|
||||
typedef task_root_type::function_team_type function_team_type ;
|
||||
|
||||
inline
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_apply_single_type arg_apply_single
|
||||
, const function_apply_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
, const functor_type & arg_functor
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
, volatile int & arg_active_count
|
||||
, const unsigned arg_sizeof_derived
|
||||
, const unsigned arg_dependence_capacity
|
||||
, const functor_type & arg_functor
|
||||
)
|
||||
: task_base_type( arg_dealloc
|
||||
, arg_apply_single
|
||||
|
@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread >
|
|||
public:
|
||||
|
||||
typedef Kokkos::Qthread execution_space ;
|
||||
typedef TaskPolicy execution_policy ;
|
||||
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
|
||||
|
||||
private:
|
||||
|
@ -489,14 +492,17 @@ public:
|
|||
, const unsigned arg_task_team_size = 0 /* choose default */
|
||||
);
|
||||
|
||||
TaskPolicy() = default ;
|
||||
TaskPolicy( TaskPolicy && rhs ) = default ;
|
||||
TaskPolicy( const TaskPolicy & rhs ) = default ;
|
||||
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
|
||||
TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy() = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
|
||||
KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocated_task_count() const { return m_active_count ; }
|
||||
|
||||
template< class ValueType >
|
||||
const Future< ValueType , execution_space > &
|
||||
spawn( const Future< ValueType , execution_space > & f
|
||||
|
@ -653,5 +659,6 @@ public:
|
|||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
|
||||
|
||||
|
|
|
@ -3,26 +3,23 @@
|
|||
|
||||
# Cloning repository and branch:
|
||||
|
||||
git clone https://github.com/stelleg/qthreads qthreads-with-clone
|
||||
git clone git@github.com:Qthreads/qthreads.git qthreads
|
||||
|
||||
cd qthreads-with-clone
|
||||
cd qthreads
|
||||
|
||||
# Added to ./git/config
|
||||
#
|
||||
# [branch "cloned_tasks"]
|
||||
# remote = origin
|
||||
# merge = refs/heads/cloned_tasks
|
||||
#
|
||||
# checkout branch with "cloned tasks"
|
||||
|
||||
git branch cloned_tasks
|
||||
git checkout cloned_tasks
|
||||
git pull
|
||||
git checkout dev-kokkos
|
||||
|
||||
# Configure/autogen
|
||||
|
||||
sh autogen.sh
|
||||
|
||||
# configurure with 'hwloc' installation:
|
||||
# configure with 'hwloc' installation:
|
||||
|
||||
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
|
||||
|
||||
# install
|
||||
|
||||
make install
|
||||
|
||||
|
|
|
@ -53,6 +53,7 @@
|
|||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -134,11 +135,7 @@ void ThreadsExec::driver(void)
|
|||
|
||||
ThreadsExec::ThreadsExec()
|
||||
: m_pool_base(0)
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
, m_scratch()
|
||||
#else
|
||||
, m_scratch(0)
|
||||
#endif
|
||||
, m_scratch_reduce_end(0)
|
||||
, m_scratch_thread_end(0)
|
||||
, m_numa_rank(0)
|
||||
|
@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec()
|
|||
{
|
||||
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
if ( m_scratch ) {
|
||||
|
@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec()
|
|||
Record::decrement( r );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
m_scratch.clear();
|
||||
|
||||
#endif
|
||||
|
||||
m_pool_base = 0 ;
|
||||
m_scratch_reduce_end = 0 ;
|
||||
m_scratch_thread_end = 0 ;
|
||||
|
@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch()
|
|||
|
||||
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
|
||||
{
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
if ( exec.m_scratch ) {
|
||||
|
@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
|
|||
Record::decrement( r );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
exec.m_scratch.clear();
|
||||
|
||||
#endif
|
||||
|
||||
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
|
||||
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
|
||||
|
||||
if ( s_threads_process.m_scratch_thread_end ) {
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
// Allocate tracked memory:
|
||||
{
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
|
||||
|
@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
|
|||
|
||||
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
|
||||
|
||||
#else
|
||||
|
||||
exec.m_scratch =
|
||||
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
|
||||
|
||||
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
|
||||
|
||||
#endif
|
||||
|
||||
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
|
||||
|
||||
// touch on this thread
|
||||
|
@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
|
|||
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
|
||||
}
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
return s_threads_process.m_scratch ;
|
||||
#else
|
||||
return s_threads_process.m_scratch.alloc_ptr() ;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count ,
|
|||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -807,6 +776,10 @@ void ThreadsExec::finalize()
|
|||
s_threads_process.m_pool_size = 1 ;
|
||||
s_threads_process.m_pool_fan_size = 0 ;
|
||||
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -49,7 +49,6 @@
|
|||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
|
@ -89,11 +88,7 @@ private:
|
|||
|
||||
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
Impl::AllocationTracker m_scratch ;
|
||||
#else
|
||||
void * m_scratch ;
|
||||
#endif
|
||||
int m_scratch_reduce_end ;
|
||||
int m_scratch_thread_end ;
|
||||
int m_numa_rank ;
|
||||
|
@ -138,19 +133,10 @@ public:
|
|||
static int get_thread_count();
|
||||
static ThreadsExec * get_thread( const int init_thread_rank );
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
|
||||
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
|
||||
|
||||
#else
|
||||
|
||||
inline void * reduce_memory() const { return m_scratch ; }
|
||||
KOKKOS_INLINE_FUNCTION void * scratch_memory() const
|
||||
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
|
||||
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
|
||||
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }
|
||||
|
||||
|
|
|
@ -129,15 +129,15 @@ public:
|
|||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space & team_shmem() const
|
||||
{ return m_team_shared.set_team_thread_mode(1,0) ; }
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space & team_scratch(int) const
|
||||
{ return m_team_shared.set_team_thread_mode(1,0) ; }
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space & thread_scratch(int) const
|
||||
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
|
||||
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
|
||||
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
|
||||
|
@ -433,10 +433,11 @@ public:
|
|||
|
||||
void next_static()
|
||||
{
|
||||
if ( ++m_league_rank < m_league_end ) {
|
||||
if ( m_league_rank < m_league_end ) {
|
||||
team_barrier();
|
||||
set_team_shared();
|
||||
}
|
||||
m_league_rank++;
|
||||
}
|
||||
|
||||
bool valid_dynamic() {
|
||||
|
@ -468,10 +469,11 @@ public:
|
|||
if(m_invalid_thread)
|
||||
return;
|
||||
|
||||
team_barrier();
|
||||
if ( ++m_league_rank < m_league_chunk_end ) {
|
||||
if ( m_league_rank < m_league_chunk_end ) {
|
||||
team_barrier();
|
||||
set_team_shared();
|
||||
}
|
||||
m_league_rank++;
|
||||
}
|
||||
|
||||
void set_league_shmem( const int arg_league_rank
|
||||
|
@ -504,8 +506,8 @@ private:
|
|||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_team_scratch_size;
|
||||
size_t m_thread_scratch_size;
|
||||
size_t m_team_scratch_size[2];
|
||||
size_t m_thread_scratch_size[2];
|
||||
|
||||
int m_chunk_size;
|
||||
|
||||
|
@ -549,8 +551,10 @@ public:
|
|||
m_team_size = p.m_team_size;
|
||||
m_team_alloc = p.m_team_alloc;
|
||||
m_team_iter = p.m_team_iter;
|
||||
m_team_scratch_size = p.m_team_scratch_size;
|
||||
m_thread_scratch_size = p.m_thread_scratch_size;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
@ -577,7 +581,12 @@ public:
|
|||
inline int team_size() const { return m_team_size ; }
|
||||
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
|
||||
inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
|
||||
if(team_size_ < 0)
|
||||
team_size_ = m_team_size;
|
||||
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
|
||||
}
|
||||
|
||||
inline int team_iter() const { return m_team_iter ; }
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
|
@ -588,8 +597,8 @@ public:
|
|||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
, m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init(league_size_request,team_size_request); (void) vector_length_request; }
|
||||
|
||||
|
@ -601,8 +610,8 @@ public:
|
|||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
, m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); }
|
||||
|
||||
|
@ -612,8 +621,8 @@ public:
|
|||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
, m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init(league_size_request,team_size_request); }
|
||||
|
||||
|
@ -623,8 +632,8 @@ public:
|
|||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_team_scratch_size ( 0 )
|
||||
, m_thread_scratch_size ( 0 )
|
||||
, m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); }
|
||||
|
||||
|
@ -639,26 +648,23 @@ public:
|
|||
|
||||
/** \brief set per team scratch size for a specific level of the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size = per_team.value;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size = per_thread.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
(void) level;
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size = per_team.value;
|
||||
p.m_thread_scratch_size = per_thread.value;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
|
|
|
@ -264,7 +264,7 @@ public:
|
|||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{ }
|
||||
};
|
||||
|
||||
|
@ -272,9 +272,10 @@ public:
|
|||
//----------------------------------------------------------------------------
|
||||
/* ParallelReduce with Kokkos::Threads and RangePolicy */
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Threads
|
||||
>
|
||||
{
|
||||
|
@ -286,14 +287,18 @@ private:
|
|||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
|
@ -344,9 +349,9 @@ private:
|
|||
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
|
||||
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
|
||||
|
||||
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
|
@ -362,7 +367,7 @@ private:
|
|||
exec.barrier();
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() );
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
|
||||
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
|
||||
|
@ -372,7 +377,7 @@ private:
|
|||
work_index = exec.get_work_index();
|
||||
}
|
||||
|
||||
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -380,7 +385,7 @@ public:
|
|||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
|
||||
ThreadsExec::start( & ParallelReduce::exec , this );
|
||||
|
||||
|
@ -391,7 +396,7 @@ public:
|
|||
const pointer_type data =
|
||||
(pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
@ -399,9 +404,14 @@ public:
|
|||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const HostViewType & arg_result_view )
|
||||
const HostViewType & arg_result_view ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< HostViewType >::value
|
||||
|
@ -410,14 +420,30 @@ public:
|
|||
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
|
||||
, "Kokkos::Threads reduce result must be a View in HostSpace" );
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelReduce with Kokkos::Threads and TeamPolicy */
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
template< class FunctorType , class ReducerType, class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, ReducerType
|
||||
, Kokkos::Threads
|
||||
>
|
||||
{
|
||||
|
@ -426,14 +452,19 @@ private:
|
|||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shared ;
|
||||
|
||||
|
@ -464,9 +495,9 @@ private:
|
|||
|
||||
ParallelReduce::template exec_team< WorkTag >
|
||||
( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
|
||||
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
|
||||
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
|
||||
|
||||
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -474,7 +505,7 @@ public:
|
|||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared );
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
|
||||
|
||||
ThreadsExec::start( & ParallelReduce::exec , this );
|
||||
|
||||
|
@ -484,20 +515,41 @@ public:
|
|||
|
||||
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result )
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{ }
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -46,9 +46,10 @@
|
|||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_PTHREAD )
|
||||
#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) ))
|
||||
#define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
|
||||
|
@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue
|
|||
, const unsigned arg_task_team_size
|
||||
)
|
||||
: m_space( Kokkos::Threads::memory_space()
|
||||
, arg_task_max_size
|
||||
, arg_task_max_size * arg_task_max_count
|
||||
, 1 /* only one level of memory pool */
|
||||
, arg_task_max_size * arg_task_max_count * 1.2
|
||||
, 16 /* log2(superblock size) */
|
||||
)
|
||||
, m_team { 0 , 0 , 0 }
|
||||
, m_serial { 0 , 0 , 0 }
|
||||
|
@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task
|
|||
// User created task memory pool with an estimate,
|
||||
// if estimate is to low then report and throw exception.
|
||||
|
||||
if ( m_space.get_min_chunk_size() < size_alloc ) {
|
||||
if ( m_space.get_min_block_size() < size_alloc ) {
|
||||
fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
|
||||
, int(size_alloc)
|
||||
, int(m_space.get_min_chunk_size())
|
||||
, int(m_space.get_min_block_size())
|
||||
);
|
||||
fflush(stderr);
|
||||
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
|
||||
|
@ -926,5 +926,5 @@ void Task::clear_dependence()
|
|||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
|
||||
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
|
|
@ -50,7 +50,7 @@
|
|||
#include <Kokkos_Threads.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_PTHREAD )
|
||||
#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
@ -737,10 +737,9 @@ public:
|
|||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */
|
||||
|
||||
|
||||
|
|
|
@ -246,8 +246,8 @@ private:
|
|||
enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
|
||||
|
||||
// The allocation record resides in Host memory space
|
||||
Record * m_record ;
|
||||
uintptr_t m_record_bits ;
|
||||
Record * m_record ;
|
||||
|
||||
public:
|
||||
|
||||
|
|
|
@ -47,8 +47,6 @@
|
|||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if KOKKOS_USING_EXP_VIEW
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/* For backward compatibility */
|
||||
|
@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing {
|
|||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -2604,18 +2604,24 @@ class ViewMapping< DstTraits , SrcTraits ,
|
|||
&&
|
||||
std::is_same< typename DstTraits::specialize , void >::value
|
||||
&&
|
||||
(
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
&&
|
||||
std::is_same< typename SrcTraits::specialize , void >::value
|
||||
&&
|
||||
(
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
|
||||
||
|
||||
(
|
||||
(
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
&&
|
||||
(
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
)
|
||||
)
|
||||
)>::type >
|
||||
{
|
||||
|
|
|
@ -1,848 +0,0 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
#include <impl/Kokkos_Singleton.hpp>
|
||||
#include <impl/Kokkos_AllocationTracker.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
/* Enable clean up of memory leaks */
|
||||
#define CLEAN_UP_MEMORY_LEAKS 0
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
namespace {
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// AllocationRecord
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Used to track details about an allocation and provide a ref count
|
||||
// sizeof(AllocationRecord) == 128
|
||||
struct AllocationRecord
|
||||
{
|
||||
enum {
|
||||
OFFSET = sizeof(AllocatorBase*) // allocator
|
||||
+ sizeof(void*) // alloc_ptr
|
||||
+ sizeof(uint64_t) // alloc_size
|
||||
+ sizeof(AllocatorAttributeBase*) // attribute
|
||||
+ sizeof(uint32_t) // node_index
|
||||
+ sizeof(uint32_t) // ref_count
|
||||
, LABEL_LENGTH = 128 - OFFSET
|
||||
};
|
||||
|
||||
AllocatorBase * const allocator;
|
||||
void * const alloc_ptr;
|
||||
const uint64_t alloc_size;
|
||||
AllocatorAttributeBase * const attribute;
|
||||
const int32_t node_index;
|
||||
volatile uint32_t ref_count;
|
||||
const char label[LABEL_LENGTH];
|
||||
|
||||
|
||||
AllocationRecord( AllocatorBase * const arg_allocator
|
||||
, void * arg_alloc_ptr
|
||||
, uint64_t arg_alloc_size
|
||||
, int32_t arg_node_index
|
||||
, const std::string & arg_label
|
||||
)
|
||||
: allocator(arg_allocator)
|
||||
, alloc_ptr(arg_alloc_ptr)
|
||||
, alloc_size(arg_alloc_size)
|
||||
, attribute(NULL)
|
||||
, node_index(arg_node_index)
|
||||
, ref_count(1)
|
||||
, label() // zero fill
|
||||
{
|
||||
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
|
||||
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
|
||||
}
|
||||
|
||||
~AllocationRecord()
|
||||
{
|
||||
if (attribute) {
|
||||
delete attribute;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t increment_ref_count()
|
||||
{
|
||||
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
|
||||
return old_value + 1u;
|
||||
}
|
||||
|
||||
uint32_t decrement_ref_count()
|
||||
{
|
||||
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
|
||||
return old_value - 1u;
|
||||
}
|
||||
|
||||
void print( std::ostream & oss ) const
|
||||
{
|
||||
oss << "{ " << allocator->name()
|
||||
<< " } : \"" << label
|
||||
<< "\" ref_count(" << ref_count
|
||||
<< ") memory[ " << alloc_ptr
|
||||
<< " + " << alloc_size
|
||||
<< " ]" ;
|
||||
}
|
||||
|
||||
bool set_attribute( AllocatorAttributeBase * attr )
|
||||
{
|
||||
bool result = false;
|
||||
if (attribute == NULL) {
|
||||
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
|
||||
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
|
||||
, attr );
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// disallow copy and assignment
|
||||
AllocationRecord( const AllocationRecord & );
|
||||
AllocationRecord & operator=(const AllocationRecord &);
|
||||
};
|
||||
|
||||
template <int NumBlocks>
|
||||
struct Bitset
|
||||
{
|
||||
enum { blocks = NumBlocks };
|
||||
enum { size = blocks * 64 };
|
||||
enum { block_mask = 63u };
|
||||
enum { block_shift = 6 };
|
||||
|
||||
// used to find free bits in a bitset
|
||||
static int count_trailing_zeros(uint64_t x)
|
||||
{
|
||||
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
|
||||
return x ? __builtin_ctzll(x) : 64;
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
enum { shift = 32 };
|
||||
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
|
||||
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
|
||||
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
|
||||
64 ;
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return x ? __cnttz8(x) : 64;
|
||||
#else
|
||||
int i = 0;
|
||||
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
|
||||
return i;
|
||||
#endif
|
||||
}
|
||||
|
||||
Bitset()
|
||||
: m_bits()
|
||||
{
|
||||
for (int i=0; i < blocks; ++i) {
|
||||
m_bits[i] = 0u;
|
||||
}
|
||||
}
|
||||
|
||||
bool set( int i )
|
||||
{
|
||||
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
|
||||
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
|
||||
}
|
||||
|
||||
bool reset( int i )
|
||||
{
|
||||
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
|
||||
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
|
||||
}
|
||||
|
||||
bool test( int i )
|
||||
{
|
||||
const uint64_t block = m_bits[ i >> block_shift ];
|
||||
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
|
||||
return block & bit;
|
||||
}
|
||||
|
||||
int find_first_unset() const
|
||||
{
|
||||
for (int i=0; i < blocks; ++i) {
|
||||
const uint64_t block = m_bits[i];
|
||||
int b = count_trailing_zeros( ~block );
|
||||
|
||||
if ( b < 64 ) {
|
||||
return (i << block_shift) + b;
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
volatile uint64_t m_bits[blocks];
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// AllocationRecordPool -- singleton class
|
||||
//
|
||||
// global_alloc_rec_pool is the ONLY instance of this class
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
// Record AllocationRecords in a lock-free circular list.
|
||||
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
|
||||
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
|
||||
// The head of the list is atomically updated to the last node found with
|
||||
// unused space.
|
||||
//
|
||||
// Cost time to create an allocation record: amortized O(1), worst case O(num nodes)
|
||||
// Cost to destroy an allocation recored: O(1)
|
||||
//
|
||||
// Singleton allocations are pushed onto a lock-free stack that is destroyed
|
||||
// after the circular list of allocation records.
|
||||
struct AllocationRecordPool
|
||||
{
|
||||
enum { BITSET_BLOCKS = 15 };
|
||||
|
||||
typedef Bitset<BITSET_BLOCKS> bitset_type;
|
||||
|
||||
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
|
||||
|
||||
struct AllocationNode
|
||||
{
|
||||
AllocationNode()
|
||||
: next()
|
||||
, bitset()
|
||||
, buffer()
|
||||
{
|
||||
// set the first bit to used
|
||||
bitset.set(0);
|
||||
}
|
||||
|
||||
void * get_buffer( int32_t node_index )
|
||||
{
|
||||
return buffer + (node_index-1) * sizeof(AllocationRecord);
|
||||
}
|
||||
|
||||
// return 0 if no space is available in the node
|
||||
int32_t get_node_index()
|
||||
{
|
||||
int32_t node_index = 0;
|
||||
do {
|
||||
node_index = bitset.find_first_unset();
|
||||
|
||||
// successfully claimed a bit
|
||||
if ( node_index != bitset.size && bitset.set(node_index) )
|
||||
{
|
||||
return node_index;
|
||||
}
|
||||
} while ( node_index != bitset.size );
|
||||
return 0;
|
||||
}
|
||||
|
||||
void clear_node_index( int32_t node_index )
|
||||
{
|
||||
bitset.reset(node_index);
|
||||
}
|
||||
|
||||
AllocationNode * next;
|
||||
bitset_type bitset;
|
||||
char buffer[BUFFER_SIZE];
|
||||
};
|
||||
|
||||
struct SingletonNode
|
||||
{
|
||||
void * buffer;
|
||||
SingletonNode * next;
|
||||
Impl::singleton_destroy_function_type destroy;
|
||||
|
||||
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
|
||||
: buffer(NULL)
|
||||
, next(NULL)
|
||||
, destroy(destroy_func)
|
||||
{
|
||||
if (size) {
|
||||
buffer = malloc(size);
|
||||
create_func(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
~SingletonNode()
|
||||
{
|
||||
if (buffer) {
|
||||
try {
|
||||
destroy(buffer);
|
||||
} catch(...) {}
|
||||
free(buffer);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
AllocationRecordPool()
|
||||
: head( new AllocationNode() )
|
||||
, singleton_head(NULL)
|
||||
{
|
||||
// setup ring
|
||||
head->next = head;
|
||||
}
|
||||
|
||||
~AllocationRecordPool()
|
||||
{
|
||||
// delete allocation records
|
||||
{
|
||||
AllocationNode * start = head;
|
||||
|
||||
AllocationNode * curr = start;
|
||||
|
||||
std::vector< std::string > string_vec;
|
||||
|
||||
do {
|
||||
AllocationNode * next = curr->next;
|
||||
|
||||
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
|
||||
// print node bitset
|
||||
for (int i=0; i < bitset_type::blocks; ++i ) {
|
||||
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
|
||||
}
|
||||
std::cout << std::endl;
|
||||
#endif
|
||||
|
||||
// bit zero does not map to an AllocationRecord
|
||||
for ( int32_t i=1; i < bitset_type::size; ++i )
|
||||
{
|
||||
if (curr->bitset.test(i)) {
|
||||
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
|
||||
|
||||
std::ostringstream oss;
|
||||
alloc_rec->print( oss );
|
||||
string_vec.push_back( oss.str() );
|
||||
|
||||
#if CLEAN_UP_MEMORY_LEAKS
|
||||
/* Cleaning up memory leaks prevents memory error detection tools
|
||||
* from reporting the original source of allocation, which can
|
||||
* impede debugging with such tools.
|
||||
*/
|
||||
try {
|
||||
destroy(alloc_rec);
|
||||
}
|
||||
catch(...) {}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
curr->next = NULL;
|
||||
|
||||
delete curr;
|
||||
|
||||
curr = next;
|
||||
} while ( curr != start );
|
||||
|
||||
//if ( !string_vec.empty() ) {
|
||||
// std::sort( string_vec.begin(), string_vec.end() );
|
||||
//
|
||||
// std::ostringstream oss;
|
||||
// oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
|
||||
// for (size_t i=0; i< string_vec.size(); ++i)
|
||||
// {
|
||||
// oss << " " << string_vec[i] << std::endl;
|
||||
// }
|
||||
//
|
||||
// std::cerr << oss.str() << std::endl;
|
||||
//}
|
||||
}
|
||||
|
||||
// delete singletons
|
||||
{
|
||||
SingletonNode * curr = singleton_head;
|
||||
|
||||
while (curr) {
|
||||
SingletonNode * next = curr->next;
|
||||
delete curr;
|
||||
curr = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AllocationRecord * create( AllocatorBase * arg_allocator
|
||||
, void * arg_alloc_ptr
|
||||
, size_t arg_alloc_size
|
||||
, const std::string & arg_label
|
||||
)
|
||||
{
|
||||
AllocationNode * start = volatile_load(&head);
|
||||
|
||||
AllocationNode * curr = start;
|
||||
|
||||
|
||||
int32_t node_index = curr->get_node_index();
|
||||
|
||||
if (node_index == 0) {
|
||||
curr = volatile_load(&curr->next);
|
||||
}
|
||||
|
||||
while (node_index == 0 && curr != start)
|
||||
{
|
||||
node_index = curr->get_node_index();
|
||||
if (node_index == 0) {
|
||||
curr = volatile_load(&curr->next);
|
||||
}
|
||||
}
|
||||
|
||||
// Need to allocate and insert a new node
|
||||
if (node_index == 0 && curr == start)
|
||||
{
|
||||
AllocationNode * new_node = new AllocationNode();
|
||||
|
||||
node_index = new_node->get_node_index();
|
||||
|
||||
AllocationNode * next = NULL;
|
||||
do {
|
||||
next = volatile_load(&curr->next);
|
||||
new_node->next = next;
|
||||
memory_fence();
|
||||
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
|
||||
|
||||
curr = new_node;
|
||||
}
|
||||
|
||||
void * buffer = curr->get_buffer(node_index);
|
||||
|
||||
// try to set head to curr
|
||||
if ( start != curr )
|
||||
{
|
||||
atomic_compare_exchange( & head, start, curr );
|
||||
}
|
||||
|
||||
return new (buffer) AllocationRecord( arg_allocator
|
||||
, arg_alloc_ptr
|
||||
, arg_alloc_size
|
||||
, node_index
|
||||
, arg_label
|
||||
);
|
||||
}
|
||||
|
||||
void destroy( AllocationRecord * alloc_rec )
|
||||
{
|
||||
if (alloc_rec) {
|
||||
const int32_t node_index = alloc_rec->node_index;
|
||||
AllocationNode * node = get_node( alloc_rec );
|
||||
|
||||
// deallocate memory
|
||||
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
|
||||
|
||||
// call destructor
|
||||
alloc_rec->~AllocationRecord();
|
||||
|
||||
// wait for writes to complete
|
||||
memory_fence();
|
||||
|
||||
// clear node index
|
||||
node->clear_node_index( node_index );
|
||||
}
|
||||
}
|
||||
|
||||
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
|
||||
{
|
||||
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
|
||||
SingletonNode * next;
|
||||
|
||||
// insert new node at the head of the list
|
||||
do {
|
||||
next = volatile_load(&singleton_head);
|
||||
node->next = next;
|
||||
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
|
||||
|
||||
return node->buffer;
|
||||
}
|
||||
|
||||
void print_memory( std::ostream & out ) const
|
||||
{
|
||||
AllocationNode * start = head;
|
||||
|
||||
AllocationNode * curr = start;
|
||||
|
||||
std::vector< std::string > string_vec;
|
||||
|
||||
do {
|
||||
AllocationNode * next = curr->next;
|
||||
|
||||
// bit zero does not map to an AllocationRecord
|
||||
for ( int32_t i=1; i < bitset_type::size; ++i )
|
||||
{
|
||||
if (curr->bitset.test(i)) {
|
||||
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
|
||||
|
||||
std::ostringstream oss;
|
||||
alloc_rec->print( oss );
|
||||
string_vec.push_back( oss.str() );
|
||||
}
|
||||
}
|
||||
curr = next;
|
||||
} while ( curr != start );
|
||||
|
||||
if ( !string_vec.empty() ) {
|
||||
std::sort( string_vec.begin(), string_vec.end() );
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << "Tracked Memory:" << std::endl;
|
||||
for (size_t i=0; i< string_vec.size(); ++i)
|
||||
{
|
||||
oss << " " << string_vec[i] << std::endl;
|
||||
}
|
||||
out << oss.str() << std::endl;
|
||||
}
|
||||
else {
|
||||
out << "No Tracked Memory" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// find an AllocationRecord such that
|
||||
// alloc_ptr <= ptr < alloc_ptr + alloc_size
|
||||
// otherwise return NULL
|
||||
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
|
||||
{
|
||||
AllocationNode * start = head;
|
||||
|
||||
AllocationNode * curr = start;
|
||||
|
||||
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
|
||||
|
||||
do {
|
||||
AllocationNode * next = curr->next;
|
||||
|
||||
// bit zero does not map to an AllocationRecord
|
||||
for ( int32_t i=1; i < bitset_type::size; ++i )
|
||||
{
|
||||
if (curr->bitset.test(i)) {
|
||||
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
|
||||
|
||||
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
|
||||
|
||||
if ( (allocator == alloc_rec->allocator)
|
||||
&& (alloc_ptr <= char_ptr)
|
||||
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
|
||||
{
|
||||
return alloc_rec;
|
||||
}
|
||||
}
|
||||
}
|
||||
curr = next;
|
||||
} while ( curr != start );
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
AllocationNode * get_node( AllocationRecord * alloc_rec )
|
||||
{
|
||||
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
|
||||
}
|
||||
|
||||
AllocationNode * head;
|
||||
SingletonNode * singleton_head;
|
||||
};
|
||||
|
||||
// create the global pool for allocation records
|
||||
AllocationRecordPool global_alloc_rec_pool;
|
||||
|
||||
|
||||
|
||||
// convert a uintptr_t to an AllocationRecord pointer
|
||||
inline
|
||||
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
|
||||
{
|
||||
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Allocation Tracker methods
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// Create a reference counted AllocationTracker
|
||||
void AllocationTracker::initalize( AllocatorBase * arg_allocator
|
||||
, void * arg_alloc_ptr
|
||||
, size_t arg_alloc_size
|
||||
, const std::string & arg_label
|
||||
)
|
||||
{
|
||||
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
|
||||
// create record
|
||||
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
|
||||
, arg_alloc_ptr
|
||||
, arg_alloc_size
|
||||
, arg_label
|
||||
);
|
||||
|
||||
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
|
||||
}
|
||||
}
|
||||
|
||||
void AllocationTracker::reallocate( size_t size ) const
|
||||
{
|
||||
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
|
||||
|
||||
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
|
||||
|
||||
if ( NULL != the_alloc_ptr )
|
||||
{
|
||||
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
|
||||
*const_cast<uint64_t *>(&rec->alloc_size) = size;
|
||||
}
|
||||
else {
|
||||
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void AllocationTracker::increment_ref_count() const
|
||||
{
|
||||
to_alloc_rec( m_alloc_rec )->increment_ref_count();
|
||||
}
|
||||
|
||||
|
||||
void AllocationTracker::decrement_ref_count() const
|
||||
{
|
||||
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
|
||||
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
|
||||
if (the_ref_count == 0u) {
|
||||
try {
|
||||
global_alloc_rec_pool.destroy( alloc_rec );
|
||||
}
|
||||
catch(...) {}
|
||||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
|
||||
|
||||
}
|
||||
|
||||
AllocatorBase * AllocationTracker::allocator() const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
return to_alloc_rec(m_alloc_rec)->allocator;
|
||||
}
|
||||
return Allocator<NullAllocator>::singleton();
|
||||
}
|
||||
|
||||
void * AllocationTracker::alloc_ptr() const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
size_t AllocationTracker::alloc_size() const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
return to_alloc_rec(m_alloc_rec)->alloc_size;
|
||||
}
|
||||
return 0u;
|
||||
}
|
||||
|
||||
size_t AllocationTracker::ref_count() const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
return to_alloc_rec(m_alloc_rec)->ref_count;
|
||||
}
|
||||
return 0u;
|
||||
}
|
||||
|
||||
char const * AllocationTracker::label() const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
return to_alloc_rec(m_alloc_rec)->label;
|
||||
}
|
||||
return "[Empty Allocation Tracker]";
|
||||
}
|
||||
|
||||
void AllocationTracker::print( std::ostream & oss) const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
to_alloc_rec(m_alloc_rec)->print(oss);
|
||||
}
|
||||
else {
|
||||
oss << label();
|
||||
}
|
||||
}
|
||||
|
||||
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
|
||||
{
|
||||
bool result = false;
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
AllocatorAttributeBase * AllocationTracker::attribute() const
|
||||
{
|
||||
if (m_alloc_rec & REF_COUNT_MASK) {
|
||||
return to_alloc_rec(m_alloc_rec)->attribute;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void AllocationTracker::print_tracked_memory( std::ostream & out )
|
||||
{
|
||||
global_alloc_rec_pool.print_memory( out );
|
||||
}
|
||||
|
||||
|
||||
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
|
||||
{
|
||||
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
|
||||
|
||||
AllocationTracker tracker;
|
||||
|
||||
if ( alloc_rec != NULL )
|
||||
{
|
||||
if ( tracking_enabled() ) {
|
||||
alloc_rec->increment_ref_count();
|
||||
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
|
||||
}
|
||||
else {
|
||||
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
|
||||
}
|
||||
}
|
||||
|
||||
return tracker ;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// static AllocationTracker
|
||||
//-----------------------------------------------------------------------------
|
||||
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
|
||||
namespace {
|
||||
|
||||
// TODO : Detect compiler support for thread local variables
|
||||
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
bool g_thread_local_tracking_enabled = true;
|
||||
#pragma omp threadprivate(g_thread_local_tracking_enabled)
|
||||
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
__thread bool g_thread_local_tracking_enabled = true;
|
||||
#elif defined( KOKKOS_HAVE_OPENMP )
|
||||
bool g_thread_local_tracking_enabled = true;
|
||||
#pragma omp threadprivate(g_thread_local_tracking_enabled)
|
||||
#elif defined( KOKKOS_HAVE_PTHREAD )
|
||||
__thread bool g_thread_local_tracking_enabled = true;
|
||||
#elif defined( KOKKOS_HAVE_SERIAL )
|
||||
bool g_thread_local_tracking_enabled = true;
|
||||
#endif
|
||||
} // unnamed namespace
|
||||
|
||||
void AllocationTracker::disable_tracking()
|
||||
{
|
||||
g_thread_local_tracking_enabled = false;
|
||||
}
|
||||
|
||||
void AllocationTracker::enable_tracking()
|
||||
{
|
||||
g_thread_local_tracking_enabled = true;
|
||||
}
|
||||
|
||||
bool AllocationTracker::tracking_enabled()
|
||||
{
|
||||
return g_thread_local_tracking_enabled;
|
||||
}
|
||||
#else
|
||||
namespace {
|
||||
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
|
||||
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
|
||||
}
|
||||
|
||||
void AllocationTracker::disable_tracking()
|
||||
{
|
||||
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
|
||||
Impl::throw_runtime_exception("Error: Tracking already disabled");
|
||||
}
|
||||
}
|
||||
|
||||
void AllocationTracker::enable_tracking()
|
||||
{
|
||||
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
|
||||
Impl::throw_runtime_exception("Error: Tracking already enabled");
|
||||
}
|
||||
}
|
||||
|
||||
bool AllocationTracker::tracking_enabled()
|
||||
{
|
||||
return g_tracking_enabled == TRACKING_ENABLED;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// create singleton free function
|
||||
//-----------------------------------------------------------------------------
|
||||
void * create_singleton( size_t size
|
||||
, Impl::singleton_create_function_type create_func
|
||||
, Impl::singleton_destroy_function_type destroy_func )
|
||||
{
|
||||
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
|
||||
}
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
|
@ -1,574 +0,0 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
|
||||
#define KOKKOS_ALLOCATION_TRACKER_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <iosfwd>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Create Singleton objects
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
typedef void * (*singleton_create_function_type)(void * buffer);
|
||||
typedef void (*singleton_destroy_function_type)(void *);
|
||||
|
||||
void * create_singleton( size_t size
|
||||
, singleton_create_function_type create_func
|
||||
, singleton_destroy_function_type destroy_func
|
||||
);
|
||||
|
||||
|
||||
|
||||
/// class Singleton
|
||||
///
|
||||
/// Default construct a singleton type. This method is used to circumvent
|
||||
/// order of construction issues. Singleton objects are destroyed after all
|
||||
/// other allocations in the reverse order of their creation.
|
||||
template <typename Type>
|
||||
class Singleton
|
||||
{
|
||||
public:
|
||||
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
|
||||
static Type * get()
|
||||
{
|
||||
static Type * singleton = NULL;
|
||||
if (singleton == NULL) {
|
||||
Impl::singleton_create_function_type create_func = &create;
|
||||
Impl::singleton_destroy_function_type destroy_func = &destroy;
|
||||
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
|
||||
}
|
||||
return singleton;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
/// Call the Type constructor
|
||||
static void destroy(void * ptr)
|
||||
{
|
||||
reinterpret_cast<Type*>(ptr)->~Type();
|
||||
}
|
||||
|
||||
/// placement new the Type in buffer
|
||||
static void * create(void * buffer)
|
||||
{
|
||||
return new (buffer) Type();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// AllocatorBase
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
/// class AllocatorBase
|
||||
///
|
||||
/// Abstract base class for all Allocators.
|
||||
/// Allocators should be singleton objects, use Singleton<Allocator>::get to create
|
||||
/// to avoid order of destruction issues
|
||||
class AllocatorBase
|
||||
{
|
||||
public:
|
||||
/// name of the allocator
|
||||
/// used to report memory leaks
|
||||
virtual const char * name() const = 0;
|
||||
|
||||
/// Allocate a buffer of size number of bytes
|
||||
virtual void* allocate(size_t size) const = 0;
|
||||
|
||||
/// Deallocate a buffer with size number of bytes
|
||||
/// The pointer must have been allocated with a call to corresponding allocate
|
||||
virtual void deallocate(void * ptr, size_t size) const = 0;
|
||||
|
||||
/// Changes the size of the memory block pointed to by ptr.
|
||||
/// Ptr must have been allocated with the corresponding allocate call
|
||||
/// The function may move the memory block to a new location
|
||||
/// (whose address is returned by the function).
|
||||
///
|
||||
/// The content of the memory block is preserved up to the lesser of the new and
|
||||
/// old sizes, even if the block is moved to a new location. If the new size is larger,
|
||||
/// the value of the newly allocated portion is indeterminate.
|
||||
///
|
||||
/// In case that ptr is a null pointer, the function behaves like allocate, assigning a
|
||||
/// new block of size bytes and returning a pointer to its beginning.
|
||||
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
|
||||
|
||||
/// can a texture object be bound to the allocated memory
|
||||
virtual bool support_texture_binding() const = 0;
|
||||
|
||||
/// virtual destructor
|
||||
virtual ~AllocatorBase() {}
|
||||
};
|
||||
|
||||
/// class AllocatorAttributeBase
|
||||
class AllocatorAttributeBase
|
||||
{
|
||||
public:
|
||||
virtual ~AllocatorAttributeBase() {}
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Allocator< StaticAllocator > : public AllocatorBase
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// HasStaticName
|
||||
template<typename T>
|
||||
class HasStaticName
|
||||
{
|
||||
typedef const char * (*static_method)();
|
||||
template<typename U, static_method> struct SFINAE {};
|
||||
template<typename U> static char Test(SFINAE<U, &U::name>*);
|
||||
template<typename U> static int Test(...);
|
||||
public:
|
||||
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<HasStaticName<T>::value, const char *>::type
|
||||
allocator_name()
|
||||
{
|
||||
return T::name();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<!HasStaticName<T>::value, const char *>::type
|
||||
allocator_name()
|
||||
{
|
||||
return "Unnamed Allocator";
|
||||
}
|
||||
|
||||
|
||||
// HasStaticAllocate
|
||||
template<typename T>
|
||||
class HasStaticAllocate
|
||||
{
|
||||
typedef void * (*static_method)(size_t);
|
||||
template<typename U, static_method> struct SFINAE {};
|
||||
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
|
||||
template<typename U> static int Test(...);
|
||||
public:
|
||||
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<HasStaticAllocate<T>::value, void *>::type
|
||||
allocator_allocate(size_t size)
|
||||
{
|
||||
return T::allocate(size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
|
||||
allocator_allocate(size_t)
|
||||
{
|
||||
throw_runtime_exception( std::string("Error: ")
|
||||
+ std::string(allocator_name<T>())
|
||||
+ std::string(" cannot allocate memory!") );
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// HasStaticDeallocate
|
||||
template<typename T>
|
||||
class HasStaticDeallocate
|
||||
{
|
||||
typedef void (*static_method)(void *, size_t);
|
||||
template<typename U, static_method> struct SFINAE {};
|
||||
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
|
||||
template<typename U> static int Test(...);
|
||||
public:
|
||||
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<HasStaticDeallocate<T>::value, void>::type
|
||||
allocator_deallocate(void * ptr, size_t size)
|
||||
{
|
||||
T::deallocate(ptr,size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
|
||||
allocator_deallocate(void *, size_t)
|
||||
{
|
||||
throw_runtime_exception( std::string("Error: ")
|
||||
+ std::string(allocator_name<T>())
|
||||
+ std::string(" cannot deallocate memory!") );
|
||||
}
|
||||
|
||||
// HasStaticReallocate
|
||||
template<typename T>
|
||||
class HasStaticReallocate
|
||||
{
|
||||
typedef void * (*static_method)(void *, size_t, size_t);
|
||||
template<typename U, static_method> struct SFINAE {};
|
||||
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
|
||||
template<typename U> static int Test(...);
|
||||
public:
|
||||
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<HasStaticReallocate<T>::value, void *>::type
|
||||
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
|
||||
{
|
||||
return T::reallocate(old_ptr, old_size, new_size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
|
||||
allocator_reallocate(void *, size_t, size_t)
|
||||
{
|
||||
throw_runtime_exception( std::string("Error: ")
|
||||
+ std::string(allocator_name<T>())
|
||||
+ std::string(" cannot reallocate memory!") );
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// HasStaticReallocate
|
||||
template<typename T>
|
||||
class HasStaticSupportTextureBinding
|
||||
{
|
||||
typedef bool (*static_method)();
|
||||
template<typename U, static_method> struct SFINAE {};
|
||||
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
|
||||
template<typename U> static int Test(...);
|
||||
public:
|
||||
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
|
||||
allocator_support_texture_binding()
|
||||
{
|
||||
return T::support_texture_binding();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline
|
||||
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
|
||||
allocator_support_texture_binding()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class Allocator : public AllocatorBase
|
||||
{
|
||||
public:
|
||||
virtual const char * name() const
|
||||
{
|
||||
return allocator_name<T>();
|
||||
}
|
||||
|
||||
virtual void* allocate(size_t size) const
|
||||
{
|
||||
return allocator_allocate<T>(size);
|
||||
}
|
||||
|
||||
virtual void deallocate(void * ptr, size_t size) const
|
||||
{
|
||||
allocator_deallocate<T>(ptr,size);
|
||||
}
|
||||
|
||||
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
|
||||
{
|
||||
return allocator_reallocate<T>(old_ptr, old_size, new_size);
|
||||
}
|
||||
|
||||
virtual bool support_texture_binding() const
|
||||
{
|
||||
return allocator_support_texture_binding<T>();
|
||||
}
|
||||
|
||||
static AllocatorBase * singleton()
|
||||
{
|
||||
return Singleton< Allocator<T> >::get();
|
||||
}
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// AllocationTracker
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// forward declaration for friend classes
|
||||
struct MallocHelper;
|
||||
|
||||
/// class AllocationTracker
|
||||
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
|
||||
/// Reference counting is disabled when the host is in parallel.
|
||||
class AllocationTracker
|
||||
{
|
||||
// use the least significant bit of the AllocationRecord pointer to indicate if the
|
||||
// AllocationTracker should reference count
|
||||
enum {
|
||||
REF_COUNT_BIT = static_cast<uintptr_t>(1)
|
||||
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
|
||||
};
|
||||
|
||||
public:
|
||||
|
||||
/// Find an AllocationTracker such that
|
||||
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
|
||||
/// O(n) where n is the number of tracked allocations.
|
||||
template <typename StaticAllocator>
|
||||
static AllocationTracker find( void const * ptr )
|
||||
{
|
||||
return find( ptr, Allocator<StaticAllocator>::singleton() );
|
||||
}
|
||||
|
||||
|
||||
/// Pretty print all the currently tracked memory
|
||||
static void print_tracked_memory( std::ostream & out );
|
||||
|
||||
/// Default constructor
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
AllocationTracker()
|
||||
: m_alloc_rec(0)
|
||||
{}
|
||||
|
||||
/// Create a AllocationTracker
|
||||
///
|
||||
/// Start reference counting the alloc_ptr.
|
||||
/// When the reference count reachs 0 the allocator deallocate method
|
||||
/// will be call with the given size. The alloc_ptr should have been
|
||||
/// allocated with the allocator's allocate method.
|
||||
///
|
||||
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
|
||||
/// do nothing
|
||||
template <typename StaticAllocator>
|
||||
AllocationTracker( StaticAllocator const &
|
||||
, void * arg_alloc_ptr
|
||||
, size_t arg_alloc_size
|
||||
, const std::string & arg_label = std::string("") )
|
||||
: m_alloc_rec(0)
|
||||
{
|
||||
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
|
||||
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
|
||||
}
|
||||
|
||||
/// Create a AllocationTracker
|
||||
///
|
||||
/// Start reference counting the alloc_ptr.
|
||||
/// When the reference count reachs 0 the allocator deallocate method
|
||||
/// will be call with the given size. The alloc_ptr should have been
|
||||
/// allocated with the allocator's allocate method.
|
||||
///
|
||||
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
|
||||
/// do nothing
|
||||
template <typename StaticAllocator>
|
||||
AllocationTracker( StaticAllocator const &
|
||||
, size_t arg_alloc_size
|
||||
, const std::string & arg_label = std::string("")
|
||||
)
|
||||
: m_alloc_rec(0)
|
||||
{
|
||||
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
|
||||
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
|
||||
|
||||
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
|
||||
}
|
||||
|
||||
/// Copy an AllocatorTracker
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
AllocationTracker( const AllocationTracker & rhs )
|
||||
: m_alloc_rec( rhs.m_alloc_rec)
|
||||
{
|
||||
#if !defined( __CUDA_ARCH__ )
|
||||
if ( rhs.ref_counting() && tracking_enabled() ) {
|
||||
increment_ref_count();
|
||||
}
|
||||
else {
|
||||
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
|
||||
}
|
||||
#else
|
||||
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Copy an AllocatorTracker
|
||||
/// Decrement the reference count of the current tracker if necessary
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
AllocationTracker & operator=( const AllocationTracker & rhs )
|
||||
{
|
||||
if (this != &rhs) {
|
||||
#if !defined( __CUDA_ARCH__ )
|
||||
if ( ref_counting() ) {
|
||||
decrement_ref_count();
|
||||
}
|
||||
|
||||
m_alloc_rec = rhs.m_alloc_rec;
|
||||
|
||||
if ( rhs.ref_counting() && tracking_enabled() ) {
|
||||
increment_ref_count();
|
||||
}
|
||||
else {
|
||||
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
|
||||
}
|
||||
#else
|
||||
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
|
||||
#endif
|
||||
}
|
||||
|
||||
return * this;
|
||||
}
|
||||
|
||||
/// Destructor
|
||||
/// Decrement the reference count if necessary
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~AllocationTracker()
|
||||
{
|
||||
#if !defined( __CUDA_ARCH__ )
|
||||
if ( ref_counting() ) {
|
||||
decrement_ref_count();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Is the tracker valid?
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool is_valid() const
|
||||
{
|
||||
return (m_alloc_rec & REF_COUNT_MASK);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// clear the tracker
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void clear()
|
||||
{
|
||||
#if !defined( __CUDA_ARCH__ )
|
||||
if ( ref_counting() ) {
|
||||
decrement_ref_count();
|
||||
}
|
||||
#endif
|
||||
m_alloc_rec = 0;
|
||||
}
|
||||
|
||||
/// is this tracker currently counting allocations?
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool ref_counting() const
|
||||
{
|
||||
return (m_alloc_rec & REF_COUNT_BIT);
|
||||
}
|
||||
|
||||
AllocatorBase * allocator() const;
|
||||
|
||||
/// pointer to the allocated memory
|
||||
void * alloc_ptr() const;
|
||||
|
||||
/// size in bytes of the allocated memory
|
||||
size_t alloc_size() const;
|
||||
|
||||
/// the current reference count
|
||||
size_t ref_count() const;
|
||||
|
||||
/// the label given to the allocation
|
||||
char const * label() const;
|
||||
|
||||
/// pretty print all the tracker's information to the std::ostream
|
||||
void print( std::ostream & oss) const;
|
||||
|
||||
|
||||
/// set an attribute ptr on the allocation record
|
||||
/// the arg_attribute pointer will be deleted when the record is destroyed
|
||||
/// the attribute ptr can only be set once
|
||||
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
|
||||
|
||||
/// get the attribute ptr from the allocation record
|
||||
AllocatorAttributeBase * attribute() const;
|
||||
|
||||
|
||||
/// reallocate the memory tracked by this allocation
|
||||
/// NOT thread-safe
|
||||
void reallocate( size_t size ) const;
|
||||
|
||||
static void disable_tracking();
|
||||
static void enable_tracking();
|
||||
static bool tracking_enabled();
|
||||
|
||||
private:
|
||||
|
||||
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
|
||||
|
||||
void initalize( AllocatorBase * arg_allocator
|
||||
, void * arg_alloc_ptr
|
||||
, size_t arg_alloc_size
|
||||
, std::string const & label );
|
||||
|
||||
void increment_ref_count() const;
|
||||
void decrement_ref_count() const;
|
||||
|
||||
friend struct Impl::MallocHelper;
|
||||
|
||||
uintptr_t m_alloc_rec;
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
#endif //KOKKOS_ALLOCATION_TRACKER_HPP
|
||||
|
|
@ -0,0 +1,197 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
|
||||
#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Concepts.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
template < typename ExecutionSpace = void
|
||||
, typename Schedule = void
|
||||
, typename WorkTag = void
|
||||
, typename IndexType = void
|
||||
, typename IterationPattern = void
|
||||
>
|
||||
struct PolicyTraitsBase
|
||||
{
|
||||
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
|
||||
|
||||
using execution_space = ExecutionSpace;
|
||||
using schedule_type = Schedule;
|
||||
using work_tag = WorkTag;
|
||||
using index_type = IndexType;
|
||||
using iteration_pattern = IterationPattern;
|
||||
};
|
||||
|
||||
|
||||
template <typename PolicyBase, typename ExecutionSpace>
|
||||
struct SetExecutionSpace
|
||||
{
|
||||
static_assert( is_void<typename PolicyBase::execution_space>::value
|
||||
, "Kokkos Error: More than one execution space given" );
|
||||
using type = PolicyTraitsBase< ExecutionSpace
|
||||
, typename PolicyBase::schedule_type
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
>;
|
||||
};
|
||||
|
||||
template <typename PolicyBase, typename Schedule>
|
||||
struct SetSchedule
|
||||
{
|
||||
static_assert( is_void<typename PolicyBase::schedule_type>::value
|
||||
, "Kokkos Error: More than one schedule type given" );
|
||||
using type = PolicyTraitsBase< typename PolicyBase::execution_space
|
||||
, Schedule
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
>;
|
||||
};
|
||||
|
||||
template <typename PolicyBase, typename WorkTag>
|
||||
struct SetWorkTag
|
||||
{
|
||||
static_assert( is_void<typename PolicyBase::work_tag>::value
|
||||
, "Kokkos Error: More than one work tag given" );
|
||||
using type = PolicyTraitsBase< typename PolicyBase::execution_space
|
||||
, typename PolicyBase::schedule_type
|
||||
, WorkTag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
>;
|
||||
};
|
||||
|
||||
template <typename PolicyBase, typename IndexType>
|
||||
struct SetIndexType
|
||||
{
|
||||
static_assert( is_void<typename PolicyBase::index_type>::value
|
||||
, "Kokkos Error: More than one index type given" );
|
||||
using type = PolicyTraitsBase< typename PolicyBase::execution_space
|
||||
, typename PolicyBase::schedule_type
|
||||
, typename PolicyBase::work_tag
|
||||
, IndexType
|
||||
, typename PolicyBase::iteration_pattern
|
||||
>;
|
||||
};
|
||||
|
||||
|
||||
template <typename PolicyBase, typename IterationPattern>
|
||||
struct SetIterationPattern
|
||||
{
|
||||
static_assert( is_void<typename PolicyBase::iteration_pattern>::value
|
||||
, "Kokkos Error: More than one iteration_pattern given" );
|
||||
using type = PolicyTraitsBase< typename PolicyBase::execution_space
|
||||
, typename PolicyBase::schedule_type
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, IterationPattern
|
||||
>;
|
||||
};
|
||||
|
||||
|
||||
template <typename Base, typename... Traits>
|
||||
struct AnalyzePolicy;
|
||||
|
||||
template <typename Base, typename T, typename... Traits>
|
||||
struct AnalyzePolicy<Base, T, Traits...> : public
|
||||
AnalyzePolicy<
|
||||
typename std::conditional< is_execution_space<T>::value , SetExecutionSpace<Base,T>
|
||||
, typename std::conditional< is_schedule_type<T>::value , SetSchedule<Base,T>
|
||||
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
|
||||
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
|
||||
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
|
||||
, SetWorkTag<Base,T>
|
||||
>::type >::type >::type >::type>::type::type
|
||||
, Traits...
|
||||
>
|
||||
{};
|
||||
|
||||
template <typename Base>
|
||||
struct AnalyzePolicy<Base>
|
||||
{
|
||||
using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
|
||||
, DefaultExecutionSpace
|
||||
, typename Base::execution_space
|
||||
>::type;
|
||||
|
||||
using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
|
||||
, Schedule< Static >
|
||||
, typename Base::schedule_type
|
||||
>::type;
|
||||
|
||||
using work_tag = typename Base::work_tag;
|
||||
|
||||
using index_type = typename std::conditional< is_void< typename Base::index_type >::value
|
||||
, IndexType< typename execution_space::size_type >
|
||||
, typename Base::index_type
|
||||
>::type
|
||||
::type // nasty hack to make index_type into an integral_type
|
||||
; // instead of the wrapped IndexType<T> for backwards compatibility
|
||||
|
||||
using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
|
||||
, void // TODO set default iteration pattern
|
||||
, typename Base::iteration_pattern
|
||||
>::type;
|
||||
using type = PolicyTraitsBase< execution_space
|
||||
, schedule_type
|
||||
, work_tag
|
||||
, index_type
|
||||
, iteration_pattern
|
||||
>;
|
||||
};
|
||||
|
||||
template <typename... Traits>
|
||||
struct PolicyTraits
|
||||
: public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
|
||||
{};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
|
||||
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
|
|||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
if( return_val == compare ) {
|
||||
const T tmp = *dest = val;
|
||||
// Don't use the following line of code here:
|
||||
//
|
||||
//const T tmp = *dest = val;
|
||||
//
|
||||
// Instead, put each assignment in its own statement. This is
|
||||
// because the overload of T::operator= for volatile *this should
|
||||
// return void, not volatile T&. See Kokkos #177:
|
||||
//
|
||||
// https://github.com/kokkos/kokkos/issues/177
|
||||
*dest = val;
|
||||
const T tmp = *dest;
|
||||
#ifndef KOKKOS_COMPILER_CLANG
|
||||
(void) tmp;
|
||||
#endif
|
||||
|
@ -239,7 +249,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
|
|||
{
|
||||
retval = dest[0];
|
||||
if ( retval == compare )
|
||||
dest[0] = val;
|
||||
dest[0] = val;
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest ,
|
|||
{
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
const T tmp = *dest = val;
|
||||
// Don't use the following line of code here:
|
||||
//
|
||||
//const T tmp = *dest = val;
|
||||
//
|
||||
// Instead, put each assignment in its own statement. This is
|
||||
// because the overload of T::operator= for volatile *this should
|
||||
// return void, not volatile T&. See Kokkos #177:
|
||||
//
|
||||
// https://github.com/kokkos/kokkos/issues/177
|
||||
*dest = val;
|
||||
const T tmp = *dest;
|
||||
#ifndef KOKKOS_COMPILER_CLANG
|
||||
(void) tmp;
|
||||
#endif
|
||||
|
@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest ,
|
|||
// member. The volatile return value implicitly defines a
|
||||
// dereference that some compilers (gcc 4.7.2) warn is being ignored.
|
||||
// Suppress warning by casting return to void.
|
||||
(void)( *dest = val );
|
||||
//(void)( *dest = val );
|
||||
*dest = val;
|
||||
|
||||
Impl::unlock_address_host_space( (void*) dest );
|
||||
}
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
assume.i = oldval.i ;
|
||||
newval.t = assume.t + val ;
|
||||
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
|
||||
} while ( assumed.i != oldval.i );
|
||||
} while ( assume.i != oldval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
@ -156,9 +156,26 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
|
||||
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_fetch_add( volatile int * dest , const int val )
|
||||
{
|
||||
int original = val;
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock xadd %1, %0"
|
||||
: "+m" (*dest), "+r" (original)
|
||||
: "m" (*dest), "r" (original)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
return original;
|
||||
}
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_fetch_add( volatile int * const dest , const int val )
|
||||
{ return __sync_fetch_and_add(dest,val); }
|
||||
{ return __sync_fetch_and_add(dest, val); }
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
long int atomic_fetch_add( volatile long int * const dest , const long int val )
|
||||
|
@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
{
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
const T tmp = *dest = return_val + val;
|
||||
// Don't use the following line of code here:
|
||||
//
|
||||
//const T tmp = *dest = return_val + val;
|
||||
//
|
||||
// Instead, put each assignment in its own statement. This is
|
||||
// because the overload of T::operator= for volatile *this should
|
||||
// return void, not volatile T&. See Kokkos #177:
|
||||
//
|
||||
// https://github.com/kokkos/kokkos/issues/177
|
||||
*dest = return_val + val;
|
||||
const T tmp = *dest;
|
||||
(void) tmp;
|
||||
Impl::unlock_address_host_space( (void*) dest );
|
||||
return return_val;
|
||||
|
|
|
@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
assume.i = oldval.i ;
|
||||
newval.t = assume.t - val ;
|
||||
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
|
||||
} while ( assumed.i != oldval.i );
|
||||
} while ( assume.i != oldval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
|
|
@ -48,6 +48,22 @@
|
|||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<class Scalar1, class Scalar2>
|
||||
struct MaxOper {
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
|
||||
return (val1 > val2 ? val1 : val2);
|
||||
}
|
||||
};
|
||||
|
||||
template<class Scalar1, class Scalar2>
|
||||
struct MinOper {
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
|
||||
return (val1 < val2 ? val1 : val2);
|
||||
}
|
||||
};
|
||||
|
||||
template<class Scalar1, class Scalar2>
|
||||
struct AddOper {
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
|
@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
|
|||
namespace Kokkos {
|
||||
|
||||
// Fetch_Oper atomics: return value before operation
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_max(volatile T * const dest, const T val) {
|
||||
return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
|
||||
}
|
||||
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_min(volatile T * const dest, const T val) {
|
||||
return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
|
||||
}
|
||||
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_mul(volatile T * const dest, const T val) {
|
||||
|
@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
|
|||
|
||||
|
||||
// Oper Fetch atomics: return value after operation
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_max_fetch(volatile T * const dest, const T val) {
|
||||
return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
|
||||
}
|
||||
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_min_fetch(volatile T * const dest, const T val) {
|
||||
return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
|
||||
}
|
||||
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_mul_fetch(volatile T * const dest, const T val) {
|
||||
|
|
|
@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
|
|||
typedef int64_t type;
|
||||
};
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
class AllocationTracker;
|
||||
|
||||
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
|
||||
template<class ViewTraits>
|
||||
class ViewDataHandle<
|
||||
ViewTraits ,
|
||||
typename enable_if<
|
||||
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
|
||||
( ViewTraits::memory_traits::Atomic )
|
||||
>::type >
|
||||
{
|
||||
private:
|
||||
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
|
||||
// (sizeof(typename ViewTraits::const_value_type)==8),
|
||||
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
|
||||
// atomic_view_possible;
|
||||
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
|
||||
typedef ViewDataHandle self_type;
|
||||
|
||||
public:
|
||||
enum { ReturnTypeIsReference = false };
|
||||
|
||||
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
|
||||
typedef Impl::AtomicDataElement<ViewTraits> return_type;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
|
||||
{
|
||||
return handle_type(arg_data_ptr);
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,287 +0,0 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
#if ! KOKKOS_USING_EXP_VIEW
|
||||
|
||||
#include <impl/Kokkos_BasicAllocators.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
|
||||
#include <stdint.h> // uintptr_t
|
||||
#include <cstdlib> // for malloc, realloc, and free
|
||||
#include <cstring> // for memcpy
|
||||
|
||||
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
|
||||
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
|
||||
#endif
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
void* MallocAllocator::allocate( size_t size )
|
||||
{
|
||||
void * ptr = NULL;
|
||||
if (size) {
|
||||
ptr = malloc(size);
|
||||
|
||||
if (!ptr)
|
||||
{
|
||||
std::ostringstream msg ;
|
||||
msg << name() << ": allocate(" << size << ") FAILED";
|
||||
throw_runtime_exception( msg.str() );
|
||||
}
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
if (ptr) {
|
||||
free(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
|
||||
{
|
||||
void * ptr = realloc(old_ptr, new_size);
|
||||
|
||||
if (new_size > 0u && ptr == NULL) {
|
||||
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace {
|
||||
|
||||
void * raw_aligned_allocate( size_t size, size_t alignment )
|
||||
{
|
||||
void * ptr = NULL;
|
||||
if ( size ) {
|
||||
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
|
||||
ptr = _mm_malloc( size , alignment );
|
||||
|
||||
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
|
||||
posix_memalign( & ptr, alignment , size );
|
||||
|
||||
#else
|
||||
// Over-allocate to and round up to guarantee proper alignment.
|
||||
size_t size_padded = size + alignment + sizeof(void *);
|
||||
void * alloc_ptr = malloc( size_padded );
|
||||
|
||||
if (alloc_ptr) {
|
||||
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
|
||||
// offset enough to record the alloc_ptr
|
||||
address += sizeof(void *);
|
||||
uintptr_t rem = address % alignment;
|
||||
uintptr_t offset = rem ? (alignment - rem) : 0u;
|
||||
address += offset;
|
||||
ptr = reinterpret_cast<void *>(address);
|
||||
// record the alloc'd pointer
|
||||
address -= sizeof(void *);
|
||||
*reinterpret_cast<void **>(address) = alloc_ptr;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
if ( ptr ) {
|
||||
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
|
||||
_mm_free( ptr );
|
||||
|
||||
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
free( ptr );
|
||||
#else
|
||||
// get the alloc'd pointer
|
||||
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
|
||||
free( alloc_ptr );
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void* AlignedAllocator::allocate( size_t size )
|
||||
{
|
||||
void * ptr = 0 ;
|
||||
|
||||
if ( size ) {
|
||||
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
|
||||
|
||||
if (!ptr)
|
||||
{
|
||||
std::ostringstream msg ;
|
||||
msg << name() << ": allocate(" << size << ") FAILED";
|
||||
throw_runtime_exception( msg.str() );
|
||||
}
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void AlignedAllocator::deallocate( void * ptr, size_t size )
|
||||
{
|
||||
raw_aligned_deallocate( ptr, size);
|
||||
}
|
||||
|
||||
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
|
||||
{
|
||||
void * ptr = old_ptr;;
|
||||
|
||||
if (old_size < new_size) {
|
||||
ptr = allocate( new_size );
|
||||
|
||||
memcpy(ptr, old_ptr, old_size );
|
||||
|
||||
deallocate( old_ptr, old_size );
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
// mmap flags for private anonymous memory allocation
|
||||
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
|
||||
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
|
||||
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
|
||||
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
|
||||
#else
|
||||
#define NO_MMAP
|
||||
#endif
|
||||
|
||||
// huge page tables
|
||||
#if !defined( NO_MMAP )
|
||||
#if defined( MAP_HUGETLB )
|
||||
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
|
||||
#elif defined( MMAP_FLAGS )
|
||||
#define MMAP_FLAGS_HUGE MMAP_FLAGS
|
||||
#endif
|
||||
// threshold to use huge pages
|
||||
#define MMAP_USE_HUGE_PAGES (1u << 27)
|
||||
#endif
|
||||
|
||||
// read write access to private memory
|
||||
#if !defined( NO_MMAP )
|
||||
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
|
||||
#endif
|
||||
|
||||
|
||||
void* PageAlignedAllocator::allocate( size_t size )
|
||||
{
|
||||
void *ptr = NULL;
|
||||
if (size) {
|
||||
#if !defined NO_MMAP
|
||||
if ( size < MMAP_USE_HUGE_PAGES ) {
|
||||
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
|
||||
} else {
|
||||
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
|
||||
}
|
||||
if (ptr == MAP_FAILED) {
|
||||
ptr = NULL;
|
||||
}
|
||||
#else
|
||||
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
|
||||
|
||||
ptr = raw_aligned_allocate( size, page_size);
|
||||
#endif
|
||||
if (!ptr)
|
||||
{
|
||||
std::ostringstream msg ;
|
||||
msg << name() << ": allocate(" << size << ") FAILED";
|
||||
throw_runtime_exception( msg.str() );
|
||||
}
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
|
||||
{
|
||||
#if !defined( NO_MMAP )
|
||||
munmap(ptr, size);
|
||||
#else
|
||||
raw_aligned_deallocate(ptr, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
|
||||
{
|
||||
void * ptr = NULL;
|
||||
#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ )
|
||||
|
||||
if (old_size != new_size) {
|
||||
ptr = allocate( new_size );
|
||||
|
||||
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
|
||||
|
||||
deallocate( old_ptr, old_size );
|
||||
}
|
||||
else {
|
||||
ptr = old_ptr;
|
||||
}
|
||||
#else
|
||||
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
|
||||
|
||||
if (ptr == MAP_FAILED) {
|
||||
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
|
||||
}
|
||||
#endif
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue