forked from lijiext/lammps
Merge pull request #601 from stanmoore1/kokkos_update
Update Kokkos library to v2.03.13
This commit is contained in:
commit
dd67989c76
|
@ -1,5 +1,46 @@
|
|||
# Change Log
|
||||
|
||||
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
|
||||
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
|
||||
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
|
||||
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
|
||||
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
|
||||
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
|
||||
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
|
||||
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
|
||||
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
|
||||
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
|
||||
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
|
||||
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
|
||||
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
|
||||
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
|
||||
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
|
||||
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
|
||||
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
|
||||
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
|
||||
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
|
||||
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
|
||||
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
|
||||
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
|
||||
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
|
||||
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
|
||||
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
|
||||
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
|
||||
- Kokkos CUDA fails to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
|
||||
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
|
||||
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
|
||||
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
|
||||
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
|
||||
|
||||
|
||||
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)
|
||||
|
|
|
@ -33,6 +33,7 @@ KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "lib
|
|||
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
|
||||
|
||||
# Check for advanced settings.
|
||||
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l))
|
||||
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l))
|
||||
|
@ -78,14 +79,14 @@ KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2
|
|||
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
|
||||
ifneq ($(OMPI_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep "nvcc" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
endif
|
||||
ifneq ($(MPICH_CXX),)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep "nvcc" | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l))
|
||||
endif
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
|
||||
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG = 1
|
||||
|
@ -111,6 +112,36 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
# Set compiler warnings flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
# TODO check if PGI accepts GNU style warnings
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
# TODO check if cray accepts GNU style warnings
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
else
|
||||
#gcc
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
KOKKOS_INTERNAL_COMPILER_WARNINGS =
|
||||
endif
|
||||
|
||||
# Set OpenMP flags.
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
|
||||
|
@ -162,6 +193,7 @@ endif
|
|||
|
||||
# Intel based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
|
||||
|
@ -229,13 +261,14 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
|||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
|
||||
|
||||
# Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Decide what ISA level we are able to support.
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
|
||||
|
||||
|
@ -243,7 +276,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_
|
|||
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Incompatible flags?
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
|
||||
|
@ -257,12 +290,10 @@ endif
|
|||
|
||||
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
|
||||
|
||||
# No warnings:
|
||||
KOKKOS_CXXFLAGS =
|
||||
# INTEL and CLANG warnings:
|
||||
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
|
||||
# GCC warnings:
|
||||
#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
|
||||
endif
|
||||
|
||||
KOKKOS_LIBS = -lkokkos -ldl
|
||||
KOKKOS_LDFLAGS = -L$(shell pwd)
|
||||
|
@ -486,6 +517,28 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xSSE4.2
|
||||
KOKKOS_LDFLAGS += -xSSE4.2
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_CXXFLAGS += -tp=nehalem
|
||||
KOKKOS_LDFLAGS += -tp=nehalem
|
||||
else
|
||||
# Assume that this really is a GNU compiler.
|
||||
KOKKOS_CXXFLAGS += -msse4.2
|
||||
KOKKOS_LDFLAGS += -msse4.2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
|
||||
|
||||
|
@ -689,7 +742,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
|||
endif
|
||||
endif
|
||||
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
|
||||
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
|
||||
else
|
||||
|
|
|
@ -20,8 +20,10 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
|
||||
Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp
|
||||
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
|
||||
|
@ -36,6 +38,8 @@ Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
|
|||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
|
||||
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
|
||||
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
|
|
|
@ -61,14 +61,19 @@ protected:
|
|||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned threads_count = omp_get_max_threads();
|
||||
int threads_count = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp atomic
|
||||
++threads_count;
|
||||
}
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
threads_count = Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa();
|
||||
if (threads_count > 3) {
|
||||
threads_count /= 2;
|
||||
}
|
||||
|
||||
Kokkos::OpenMP::initialize( threads_count );
|
||||
Kokkos::OpenMP::print_configuration( std::cout );
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -35,7 +35,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
|
@ -283,12 +283,12 @@ struct test_random_scalar {
|
|||
RandomGenerator& pool,
|
||||
unsigned int num_draws)
|
||||
{
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
using Kokkos::parallel_reduce;
|
||||
|
||||
{
|
||||
cerr << " -- Testing randomness properties" << endl;
|
||||
cout << " -- Testing randomness properties" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_random_functor<RandomGenerator, Scalar> functor_type;
|
||||
|
@ -307,7 +307,7 @@ struct test_random_scalar {
|
|||
( 1.5*tolerance > variance_eps)) ? 1:0;
|
||||
pass_covar = ((-2.0*tolerance < covariance_eps) &&
|
||||
( 2.0*tolerance > covariance_eps)) ? 1:0;
|
||||
cerr << "Pass: " << pass_mean
|
||||
cout << "Pass: " << pass_mean
|
||||
<< " " << pass_var
|
||||
<< " " << mean_eps
|
||||
<< " " << variance_eps
|
||||
|
@ -315,7 +315,7 @@ struct test_random_scalar {
|
|||
<< " || " << tolerance << endl;
|
||||
}
|
||||
{
|
||||
cerr << " -- Testing 1-D histogram" << endl;
|
||||
cout << " -- Testing 1-D histogram" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
|
||||
|
@ -335,7 +335,7 @@ struct test_random_scalar {
|
|||
pass_hist1d_covar = ((-0.06 < covariance_eps) &&
|
||||
( 0.06 > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 1D: " << mean_eps
|
||||
cout << "Density 1D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
<< " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
|
||||
<< " || " << tolerance
|
||||
|
@ -348,7 +348,7 @@ struct test_random_scalar {
|
|||
<< endl;
|
||||
}
|
||||
{
|
||||
cerr << " -- Testing 3-D histogram" << endl;
|
||||
cout << " -- Testing 3-D histogram" << endl;
|
||||
|
||||
RandomProperties result;
|
||||
typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
|
||||
|
@ -368,7 +368,7 @@ struct test_random_scalar {
|
|||
pass_hist3d_covar = ((-tolerance < covariance_eps) &&
|
||||
( tolerance > covariance_eps)) ? 1:0;
|
||||
|
||||
cerr << "Density 3D: " << mean_eps
|
||||
cout << "Density 3D: " << mean_eps
|
||||
<< " " << variance_eps
|
||||
<< " " << result.covariance/HIST_DIM1D/HIST_DIM1D
|
||||
<< " || " << tolerance
|
||||
|
@ -381,18 +381,18 @@ struct test_random_scalar {
|
|||
template <class RandomGenerator>
|
||||
void test_random(unsigned int num_draws)
|
||||
{
|
||||
using std::cerr;
|
||||
using std::cout;
|
||||
using std::endl;
|
||||
typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
|
||||
typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
|
||||
|
||||
|
||||
uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
|
||||
cerr << "Test Seed:" << ticks << endl;
|
||||
cout << "Test Seed:" << ticks << endl;
|
||||
|
||||
RandomGenerator pool(ticks);
|
||||
|
||||
cerr << "Test Scalar=int" << endl;
|
||||
cout << "Test Scalar=int" << endl;
|
||||
test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int.pass_mean,1);
|
||||
ASSERT_EQ( test_int.pass_var,1);
|
||||
|
@ -406,7 +406,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=unsigned int" << endl;
|
||||
cout << "Test Scalar=unsigned int" << endl;
|
||||
test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_uint.pass_mean,1);
|
||||
ASSERT_EQ( test_uint.pass_var,1);
|
||||
|
@ -420,7 +420,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=int64_t" << endl;
|
||||
cout << "Test Scalar=int64_t" << endl;
|
||||
test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_int64.pass_mean,1);
|
||||
ASSERT_EQ( test_int64.pass_var,1);
|
||||
|
@ -434,7 +434,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=uint64_t" << endl;
|
||||
cout << "Test Scalar=uint64_t" << endl;
|
||||
test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_uint64.pass_mean,1);
|
||||
ASSERT_EQ( test_uint64.pass_var,1);
|
||||
|
@ -448,7 +448,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=float" << endl;
|
||||
cout << "Test Scalar=float" << endl;
|
||||
test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_float.pass_mean,1);
|
||||
ASSERT_EQ( test_float.pass_var,1);
|
||||
|
@ -462,7 +462,7 @@ void test_random(unsigned int num_draws)
|
|||
deep_copy(density_1d,0);
|
||||
deep_copy(density_3d,0);
|
||||
|
||||
cerr << "Test Scalar=double" << endl;
|
||||
cout << "Test Scalar=double" << endl;
|
||||
test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
|
||||
ASSERT_EQ( test_double.pass_mean,1);
|
||||
ASSERT_EQ( test_double.pass_var,1);
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
|
|
@ -44,12 +44,13 @@
|
|||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<bench.hpp>
|
||||
#include<cstdlib>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize();
|
||||
|
||||
|
||||
if(argc<10) {
|
||||
|
||||
if(argc<10) {
|
||||
printf("Arguments: N K R D U F T S\n");
|
||||
printf(" P: Precision (1==float, 2==double)\n");
|
||||
printf(" N,K: dimensions of the 2D array to allocate\n");
|
||||
|
@ -68,7 +69,7 @@ int main(int argc, char* argv[]) {
|
|||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int P = atoi(argv[1]);
|
||||
int N = atoi(argv[2]);
|
||||
|
@ -80,7 +81,7 @@ int main(int argc, char* argv[]) {
|
|||
int T = atoi(argv[8]);
|
||||
int S = atoi(argv[9]);
|
||||
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if(U>8) {printf("U must be 1-8\n"); return 0;}
|
||||
if( (D!=1) && (D!=2) && (D!=4) && (D!=8) && (D!=16) && (D!=32)) {printf("D must be one of 1,2,4,8,16,32\n"); return 0;}
|
||||
if( (P!=1) && (P!=2) ) {printf("P must be one of 1,2\n"); return 0;}
|
||||
|
||||
|
|
|
@ -44,11 +44,11 @@
|
|||
#include<Kokkos_Core.hpp>
|
||||
#include<impl/Kokkos_Timer.hpp>
|
||||
#include<gather.hpp>
|
||||
#include<cstdlib>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
|
||||
if(argc<8) {
|
||||
printf("Arguments: S N K D\n");
|
||||
printf(" S: Scalar Type Size (1==float, 2==double, 4=complex<double>)\n");
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
KOKKOS_PATH = ../..
|
||||
SRC = $(wildcard *.cpp)
|
||||
|
||||
default: build
|
||||
echo "Start Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
|
||||
CXXFLAGS = -O3 -g
|
||||
LINK = ${CXX}
|
||||
LINKFLAGS =
|
||||
EXE = policy_performance.cuda
|
||||
KOKKOS_DEVICES = "Cuda,OpenMP"
|
||||
KOKKOS_ARCH = "SNB,Kepler35"
|
||||
KOKKOS_CUDA_OPTIONS+=enable_lambda
|
||||
else
|
||||
CXX = g++
|
||||
CXXFLAGS = -O3 -g -Wall -Werror
|
||||
LINK = ${CXX}
|
||||
LINKFLAGS =
|
||||
EXE = policy_performance.host
|
||||
KOKKOS_DEVICES = "OpenMP"
|
||||
KOKKOS_ARCH = "SNB"
|
||||
endif
|
||||
|
||||
DEPFLAGS = -M
|
||||
|
||||
OBJ = $(SRC:.cpp=.o)
|
||||
LIB =
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
build: $(EXE)
|
||||
|
||||
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o *.cuda *.host
|
||||
|
||||
# Compilation rules
|
||||
|
||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS) main.cpp policy_perf_test.hpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
|
@ -0,0 +1,170 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include "policy_perf_test.hpp"
|
||||
|
||||
int main(int argc, char* argv[] ) {
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
if(argc<10) {
|
||||
printf(" Ten arguments are needed to run this program:\n");
|
||||
printf(" (1)team_range, (2)thread_range, (3)vector_range, (4)outer_repeat, (5)thread_repeat, (6)vector_repeat, (7)team_size, (8)vector_size, (9)schedule, (10)test_type\n");
|
||||
printf(" team_range: number of teams (league_size)\n");
|
||||
printf(" thread_range: range for nested TeamThreadRange parallel_*\n");
|
||||
printf(" vector_range: range for nested ThreadVectorRange parallel_*\n");
|
||||
printf(" outer_repeat: number of repeats for outer parallel_* call\n");
|
||||
printf(" thread_repeat: number of repeats for TeamThreadRange parallel_* call\n");
|
||||
printf(" vector_repeat: number of repeats for ThreadVectorRange parallel_* call\n");
|
||||
printf(" team_size: number of team members (team_size)\n");
|
||||
printf(" vector_size: desired vectorization (if possible)\n");
|
||||
printf(" schedule: 1 == Static 2 == Dynamic\n");
|
||||
printf(" test_type: 3-digit code XYZ for testing (nested) parallel_*\n");
|
||||
printf(" code key: XYZ X in {1,2,3,4,5}, Y in {0,1,2}, Z in {0,1,2}\n");
|
||||
printf(" TeamPolicy:\n");
|
||||
printf(" X: 0 = none (never used, makes no sense); 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" Y: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" Z: 0 = none; 1 = parallel_for; 2 = parallel_reduce\n");
|
||||
printf(" RangePolicy:\n");
|
||||
printf(" X: 3 = parallel_for; 4 = parallel_reduce; 5 = parallel_scan\n");
|
||||
printf(" Y: 0 = none\n");
|
||||
printf(" Z: 0 = none\n");
|
||||
printf(" Example Input:\n");
|
||||
printf(" 100000 32 32 100 100 100 8 1 1 100\n");
|
||||
Kokkos::finalize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int team_range = atoi(argv[1]);
|
||||
int thread_range = atoi(argv[2]);
|
||||
int vector_range = atoi(argv[3]);
|
||||
|
||||
int outer_repeat = atoi(argv[4]);
|
||||
int thread_repeat = atoi(argv[5]);
|
||||
int vector_repeat = atoi(argv[6]);
|
||||
|
||||
int team_size = atoi(argv[7]);
|
||||
int vector_size = atoi(argv[8]);
|
||||
int schedule = atoi(argv[9]);
|
||||
int test_type = atoi(argv[10]);
|
||||
|
||||
int disable_verbose_output = 0;
|
||||
if ( argc > 11 ) {
|
||||
disable_verbose_output = atoi(argv[11]);
|
||||
}
|
||||
|
||||
if ( schedule != 1 && schedule != 2 ) {
|
||||
printf("schedule: %d\n", schedule);
|
||||
printf("Options for schedule are: 1 == Static 2 == Dynamic\n");
|
||||
Kokkos::finalize();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ( test_type != 100 && test_type != 110 && test_type != 111 && test_type != 112 && test_type != 120 && test_type != 121 && test_type != 122
|
||||
&& test_type != 200 && test_type != 210 && test_type != 211 && test_type != 212 && test_type != 220 && test_type != 221 && test_type != 222
|
||||
&& test_type != 300 && test_type != 400 && test_type != 500
|
||||
)
|
||||
{
|
||||
printf("Incorrect test_type option\n");
|
||||
Kokkos::finalize();
|
||||
return -2;
|
||||
}
|
||||
|
||||
double result = 0.0;
|
||||
|
||||
Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10,1),
|
||||
KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, double& lval) {
|
||||
lval += 1;
|
||||
}, result);
|
||||
|
||||
typedef Kokkos::View<double*, Kokkos::LayoutRight> view_type_1d;
|
||||
typedef Kokkos::View<double**, Kokkos::LayoutRight> view_type_2d;
|
||||
typedef Kokkos::View<double***, Kokkos::LayoutRight> view_type_3d;
|
||||
|
||||
// Allocate view without initializing
|
||||
// Call a 'warmup' test with 1 repeat - this will initialize the corresponding view appropriately for test and should obey first-touch etc
|
||||
// Second call to test is the one we actually care about and time
|
||||
view_type_1d v_1( Kokkos::ViewAllocateWithoutInitializing("v_1"), team_range*team_size);
|
||||
view_type_2d v_2( Kokkos::ViewAllocateWithoutInitializing("v_2"), team_range*team_size, thread_range);
|
||||
view_type_3d v_3( Kokkos::ViewAllocateWithoutInitializing("v_3"), team_range*team_size, thread_range, vector_range);
|
||||
|
||||
double result_computed = 0.0;
|
||||
double result_expect = 0.0;
|
||||
double time = 0.0;
|
||||
|
||||
if(schedule==1) {
|
||||
if ( test_type != 500 ) {
|
||||
// warmup - no repeat of loops
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
// parallel_scan: initialize 1d view for parallel_scan
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
}
|
||||
if(schedule==2) {
|
||||
if ( test_type != 500 ) {
|
||||
// warmup - no repeat of loops
|
||||
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Dynamic>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
// parallel_scan: initialize 1d view for parallel_scan
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,1,1,1,team_size,vector_size,100,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
test_policy<Kokkos::Schedule<Kokkos::Static>,int>(team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,test_type,v_1,v_2,v_3,result_computed,result_expect,time);
|
||||
}
|
||||
}
|
||||
|
||||
if ( disable_verbose_output == 0 ) {
|
||||
printf("%7i %4i %2i %9i %4i %4i %4i %2i %1i %3i %e %e %lf\n",team_range,thread_range,vector_range,outer_repeat,thread_repeat,vector_repeat,team_size,vector_size,schedule,test_type,result_computed,result_expect,time);
|
||||
}
|
||||
else {
|
||||
printf("%lf\n",time);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,354 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
template < class ViewType >
|
||||
struct ParallelScanFunctor {
|
||||
using value_type = double;
|
||||
ViewType v;
|
||||
|
||||
ParallelScanFunctor( const ViewType & v_ )
|
||||
: v(v_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const int idx, value_type& val, const bool& final ) const
|
||||
{
|
||||
// inclusive scan
|
||||
val += v(idx);
|
||||
if ( final ) {
|
||||
v(idx) = val;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class ScheduleType,class IndexType,class ViewType1, class ViewType2, class ViewType3>
|
||||
void test_policy(int team_range, int thread_range, int vector_range,
|
||||
int outer_repeat, int thread_repeat, int inner_repeat,
|
||||
int team_size, int vector_size, int test_type,
|
||||
ViewType1 &v1, ViewType2 &v2, ViewType3 &v3,
|
||||
double &result, double &result_expect, double &time) {
|
||||
|
||||
typedef Kokkos::TeamPolicy<ScheduleType,IndexType> t_policy;
|
||||
typedef typename t_policy::member_type t_team;
|
||||
Kokkos::Timer timer;
|
||||
|
||||
for(int orep = 0; orep<outer_repeat; orep++) {
|
||||
|
||||
if (test_type == 100) {
|
||||
Kokkos::parallel_for("100 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
v1(idx) = idx;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
|
||||
if (test_type == 110) {
|
||||
Kokkos::parallel_for("110 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2( idx, t ) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 111) {
|
||||
Kokkos::parallel_for("111 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
|
||||
v3( idx, t, vi ) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 112) {
|
||||
Kokkos::parallel_for("112 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
// Each team launches a parallel_for; thread_range is partitioned among team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
|
||||
vval += 1;
|
||||
}, vector_result);
|
||||
}
|
||||
v2( idx, t ) = vector_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
if (test_type == 120) {
|
||||
Kokkos::parallel_for("120 outer for", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
team_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
lval += 1;
|
||||
}, team_result);
|
||||
}
|
||||
v1(idx) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 121) {
|
||||
Kokkos::parallel_for("121 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
team_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
lval += 1;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi) {
|
||||
v3( idx, t, vi ) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
}, team_result);
|
||||
}
|
||||
v3( idx, 0, 0 ) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 122) {
|
||||
Kokkos::parallel_for("122 outer for", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0.0;
|
||||
for (int tr = 0; tr<thread_repeat; ++tr) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double &lval) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,vector_range), [&] (const int vi, double &vval) {
|
||||
vval += 1;
|
||||
}, vector_result);
|
||||
lval += vector_result;
|
||||
}, team_result);
|
||||
}
|
||||
v1(idx) = team_result;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
if (test_type == 200) {
|
||||
Kokkos::parallel_reduce("200 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
lval+=team.team_size()*team.league_rank() + team.team_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1);
|
||||
// sum ( seq( [0, team_range*team_size) )
|
||||
}
|
||||
if (test_type == 210) {
|
||||
Kokkos::parallel_reduce("210 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double thread_for = 1.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2(idx,t) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
lval+=(team.team_size()*team.league_rank() + team.team_rank() + thread_for);
|
||||
},result);
|
||||
result_expect = 0.5* (team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 211) {
|
||||
Kokkos::parallel_reduce("211 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double thread_for = 1.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
for (int vr = 0; vr<inner_repeat; ++vr)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
|
||||
v3(idx, t, vi) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
});
|
||||
}
|
||||
lval+=idx+thread_for;
|
||||
},result);
|
||||
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + 1 per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 212) {
|
||||
Kokkos::parallel_reduce("212 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double vector_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
// This parallel_for is executed by each team; the thread_range is partitioned among the team members
|
||||
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t) {
|
||||
v2(idx,t) = t;
|
||||
// prevent compiler optimizing loop away
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
vector_result = 0.0;
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double &vval) {
|
||||
vval += vi;
|
||||
}, vector_result );
|
||||
}
|
||||
});
|
||||
}
|
||||
lval+= idx + vector_result;
|
||||
},result);
|
||||
result_expect = 0.5*(team_range*team_size)*(team_range*team_size-1) + (0.5*vector_range*(vector_range-1)*team_range*team_size);
|
||||
// sum ( seq( [0, team_range*team_size) + sum( seq( [0, vector_range) ) per team_member (total of team_range*team_size) )
|
||||
}
|
||||
if (test_type == 220) {
|
||||
Kokkos::parallel_reduce("220 outer reduce", t_policy(team_range,team_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
double team_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
tval += t;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank(); // constant * league_rank
|
||||
},result);
|
||||
result_expect = 0.5*(team_range)*(team_range-1) * team_size * 0.5*(thread_range)*(thread_range-1);
|
||||
// sum ( seq( [0, team_range) * constant ); constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
if (test_type == 221) {
|
||||
Kokkos::parallel_reduce("221 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
long idx = team.league_rank()*team.team_size() + team.team_rank();
|
||||
double team_result = 0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
double vector_for = 1.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi) {
|
||||
v3(idx, t, vi) = vi;
|
||||
// prevent compiler optimizing loop away
|
||||
});
|
||||
}
|
||||
tval += t + vector_for;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range);
|
||||
// sum ( seq( [0, team_range) * constant ) + 1 per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
if (test_type == 222) {
|
||||
Kokkos::parallel_reduce("222 outer reduce", t_policy(team_range,team_size,vector_size),
|
||||
KOKKOS_LAMBDA (const t_team& team, double& lval) {
|
||||
double team_result = 0.0;
|
||||
for(int tr = 0; tr<thread_repeat; tr++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,thread_range), [&] (const int t, double& tval) {
|
||||
double vector_result = 0.0;
|
||||
for (int vr = 0; vr<inner_repeat; ++vr) {
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_range), [&] (const int vi, double& vval) {
|
||||
vval += vi;
|
||||
}, vector_result);
|
||||
}
|
||||
tval += t + vector_result;
|
||||
},team_result);
|
||||
}
|
||||
lval+=team_result*team.league_rank();
|
||||
},result);
|
||||
result_expect = 0.5* (team_range)*(team_range-1) * team_size * (0.5*(thread_range) * (thread_range-1) + thread_range*0.5*(vector_range)*(vector_range-1));
|
||||
// sum ( seq( [0, team_range) * constant ) + 1 + sum( seq([0,vector_range) ) per member per team; constant = sum( seq( [0, thread_range) )*team_size (1 per member, result for each team)
|
||||
}
|
||||
|
||||
// parallel_for RangePolicy: range = team_size*team_range
|
||||
if (test_type == 300) {
|
||||
Kokkos::parallel_for("300 outer for", team_size*team_range,
|
||||
KOKKOS_LAMBDA (const int idx) {
|
||||
v1(idx) = idx;
|
||||
// prevent compiler from optimizing away the loop
|
||||
});
|
||||
}
|
||||
// parallel_reduce RangePolicy: range = team_size*team_range
|
||||
if (test_type == 400) {
|
||||
Kokkos::parallel_reduce("400 outer reduce", team_size*team_range,
|
||||
KOKKOS_LAMBDA (const int idx, double& val) {
|
||||
val += idx;
|
||||
}, result);
|
||||
result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
|
||||
}
|
||||
// parallel_scan RangePolicy: range = team_size*team_range
|
||||
if (test_type == 500) {
|
||||
Kokkos::parallel_scan("500 outer scan", team_size*team_range,
|
||||
ParallelScanFunctor<ViewType1>(v1)
|
||||
#if 0
|
||||
// This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation
|
||||
KOKKOS_LAMBDA (const int idx, double& val, const bool& final) {
|
||||
// inclusive scan
|
||||
val += v1(idx);
|
||||
if ( final ) {
|
||||
v1(idx) = val;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
);
|
||||
// result = v1( team_size*team_range - 1 ); // won't work with Cuda - need to copy result back to host to print
|
||||
// result_expect = 0.5*(team_size*team_range)*(team_size*team_range-1);
|
||||
}
|
||||
|
||||
} // end outer for loop
|
||||
|
||||
time = timer.seconds();
|
||||
} //end test_policy
|
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash

# Runs the policy_performance executable once for every test code under both
# schedules, for each build (host, cuda) whose executable exists in the
# current directory.

echo "Performance test results for parallel_reduce code computing sum of sequence [0,N) with various (nested) policies"

EXECUTABLE=policy_performance

# Problem sizes and repeat counts passed to every run
TEAMRANGE=1000
THREADRANGE=4
VECTORRANGE=32
TEAMSIZE=4
VECTORSIZE=1
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1

# Every test code accepted by the executable
CODES=(100 110 111 112 120 121 122 200 210 211 212 220 221 222 300 400 500)

for SUFFIX in host cuda
do
  # Skip builds that are not present
  [ -e $EXECUTABLE.$SUFFIX ] || continue

  case $SUFFIX in
    host) BUILD_LABEL="Host" ;;
    cuda) BUILD_LABEL="Cuda" ;;
  esac

  for SCHEDULE in 1 2
  do
    case $SCHEDULE in
      1) SCHEDULE_LABEL="Static" ;;
      2) SCHEDULE_LABEL="Dynamic" ;;
    esac
    echo "$BUILD_LABEL tests $SCHEDULE_LABEL schedule"

    for CODE in "${CODES[@]}"
    do
      if [ $SUFFIX = host ]
      then
        # Host runs set OMP_PROC_BIND=true for the launched process only
        OMP_PROC_BIND=true ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
      else
        ./$EXECUTABLE.$SUFFIX $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
      fi
    done
  done
done
|
|
@ -0,0 +1,126 @@
|
|||
#!/bin/bash

# Sample script for benchmarking policy performance

# Suggested environment variables to export prior to executing script:
# KNL:
# OMP_NUM_THREADS=256 KMP_AFFINITY=compact
# Power:
# OMP_NUM_THREADS=64 OMP_PROC_BIND=true

# Constants and Variables:
# Vary: TEAMSIZE, and THREADRANGE
#  for TEAMSIZE in {1,2,4,5,8}; do
#  for THREADRANGE in {32,41,1000}; do
# Fixed: TEAMRANGE, VECTORRANGE, VECTORSIZE
# System specific: Adjust REPEAT values to architecture tests are run on

# Tests
# Static SCHEDULE = 1
#  Tier 1: parallel_for + RangePolicy 300
#  Tier 2: parallel_reduce, parallel_scan + RangePolicy 400 500
#  Tier 3: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
#  Tier 4: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY
# Dynamic SCHEDULE = 2
#  Tier 5: parallel_for + RangePolicy 300
#  Tier 6: parallel_reduce, parallel_scan + RangePolicy 400 500
#  Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY
#  Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY

# Results grouped by:
# 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE

EXECUTABLE=policy_performance

# Repeat counts; not varied by this script
OREPEAT=1
MREPEAT=1
IREPEAT=1
SCHEDULE=1

# Restore the default problem sizes. The tier loops overwrite TEAMSIZE and
# THREADRANGE, so this must run before every SCHEDULE pass; otherwise the
# RangePolicy tiers of the second pass inherit THREADRANGE=1000 from the
# previous pass.
reset_defaults() {
  TEAMRANGE=1000
  THREADRANGE=1
  VECTORRANGE=32
  TEAMSIZE=1
  VECTORSIZE=1
}

# Launch one run using the current values of the sweep variables.
#   $1: executable suffix (host or cuda); host runs set OMP_PROC_BIND=true
run_case() {
  if [ "$1" = host ]; then
    OMP_PROC_BIND=true ./$EXECUTABLE.$1 $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
  else
    ./$EXECUTABLE.$1 $TEAMRANGE $THREADRANGE $VECTORRANGE $OREPEAT $MREPEAT $IREPEAT $TEAMSIZE $VECTORSIZE $SCHEDULE $CODE
  fi
}

# Run all tiers under both schedules for one executable.
#   $1: executable suffix (host or cuda)
run_all_tiers() {
  for SCHEDULE in {1,2}; do

    reset_defaults

    # Tier 1 and 2, 5 and 6
    for CODE in {300,400,500}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        run_case "$1"
      done
    done

    # Tier 3, 7
    for CODE in {100,110,111,112,120,121,122}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          run_case "$1"
        done
      done
    done

    # Tier 4, 8
    for CODE in {200,210,211,212,220,221,222}; do
      for TEAMSIZE in {1,2,4,5,8}; do
        for THREADRANGE in {32,41,1000}; do
          run_case "$1"
        done
      done
    done

  done # end SCHEDULE
}

# Default defined values
reset_defaults

# Host tests
SUFFIX=host
if [ -e "$EXECUTABLE.$SUFFIX" ]; then
  echo "Host"
  run_all_tiers "$SUFFIX"
fi # end host


# Cuda tests
SUFFIX=cuda
# TEAMRANGE=10000, TEAMSIZE=8 too large
# TEAMRANGE=10000, TEAMSIZE=8, THREADRANGE=1000 too large
if [ -e "$EXECUTABLE.$SUFFIX" ]; then
  echo "Cuda"
  run_all_tiers "$SUFFIX"
fi #end cuda
|
|
@ -0,0 +1,454 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
################################################################################
|
||||
# Check if hwloc commands exist
|
||||
################################################################################
|
||||
# HPCBIND_HAS_HWLOC stays 1 only if every hwloc command checked below is found.
declare -i HPCBIND_HAS_HWLOC=1
for HPCBIND_HWLOC_TOOL in hwloc-bind hwloc-distrib hwloc-ls hwloc-calc hwloc-ps; do
  if ! type "${HPCBIND_HWLOC_TOOL}" >/dev/null 2>&1; then
    HPCBIND_HAS_HWLOC=0
  fi
done
unset HPCBIND_HWLOC_TOOL

if [[ ${HPCBIND_HAS_HWLOC} -eq 0 ]]; then
  echo "hwloc not found, no process binding will occur"
fi

# Get parent cpuset
HPCBIND_HWLOC_PARENT_CPUSET=""
if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
  MY_PID="$BASHPID"
  # Select the line whose first column is exactly this PID and take its second
  # column. A plain substring grep would also pick up other processes whose
  # PID, cpuset or name merely contains the same digits.
  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | awk -v pid="${MY_PID}" '$1 == pid { print $2; exit }')
fi
|
||||
|
||||
################################################################################
|
||||
# Check if nvidia-smi exists
|
||||
################################################################################
|
||||
# 1 when the nvidia-smi command can be found, 0 otherwise
declare -i HPCBIND_HAS_NVIDIA=0
if type nvidia-smi >/dev/null 2>&1; then
  HPCBIND_HAS_NVIDIA=1
fi


################################################################################
# Get visible gpu
################################################################################
declare -i NUM_GPUS=0
HPCBIND_VISIBLE_GPUS=""
if (( HPCBIND_HAS_NVIDIA == 1 )); then
  # One line of `nvidia-smi -L` output is counted as one GPU
  NUM_GPUS=$(nvidia-smi -L | wc -l)
  GPU_LIST="$(seq 0 $((NUM_GPUS - 1)))"
  # A CUDA_VISIBLE_DEVICES value from the environment takes precedence over
  # the generated 0..NUM_GPUS-1 list
  HPCBIND_VISIBLE_GPUS="${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}"
fi

# GPU mapping is enabled only when at least one GPU was counted
declare -i HPCBIND_ENABLE_GPU_MAPPING=0
if (( NUM_GPUS > 0 )); then
  HPCBIND_ENABLE_GPU_MAPPING=1
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Get queue id
|
||||
# supports sbatch, bsub, aprun
|
||||
################################################################################
|
||||
# Detect the launcher from the per-process index variable it exports and record
# its name and the index. The first variable found (in the order below) wins.
HPCBIND_QUEUE_NAME=""
declare -i HPCBIND_QUEUE_INDEX=0
declare -i HPCBIND_QUEUE_GPU_MAPPING=0

# LSF exports the job index as LSB_JOBINDEX. The previously used spelling
# LBS_JOBINDEX is still honoured as a fallback for anyone setting it by hand.
HPCBIND_LSF_JOBINDEX="${LSB_JOBINDEX:-${LBS_JOBINDEX}}"

if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="sbatch"
  HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
elif [[ ! -z "${HPCBIND_LSF_JOBINDEX}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="bsub"
  HPCBIND_QUEUE_INDEX=${HPCBIND_LSF_JOBINDEX}
elif [[ ! -z "${ALPS_APP_PE}" ]]; then
  HPCBIND_QUEUE_GPU_MAPPING=1
  HPCBIND_QUEUE_NAME="aprun"
  HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
fi
unset HPCBIND_LSF_JOBINDEX
|
||||
|
||||
|
||||
################################################################################
|
||||
# Show help
|
||||
################################################################################
|
||||
# Print the usage text of this script to stdout.
# Takes no arguments; the program name shown is the basename of $0.
function show_help {
  local cmd=$(basename "$0")
  # One argument per output line; "" produces an empty line.
  printf '%s\n' \
    "Usage: ${cmd} <options> -- command ..." \
    " Set the process mask, OMP environment variables and CUDA environment" \
    " variables to sane values if possible. Uses hwloc and nvidia-smi if" \
    " available. Will preserve the current process binding, so it is safe" \
    " to use with a queuing system or mpiexec." \
    "" \
    "Options:" \
    " --no-hwloc-bind Disable binding" \
    " --proc-bind=<LOC> Set the initial process mask for the script" \
    " LOC can be any valid location argument for" \
    " hwloc-calc Default: all" \
    " --distribute=N Distribute the current cpuset into N partitions" \
    " --distribute-partition=I" \
    " Use the i'th partition (zero based)" \
    " --visible-gpus=<L> Comma separated list of gpu ids" \
    " Default: CUDA_VISIBLE_DEVICES or all gpus in" \
    " sequential order" \
    " --gpu-ignore-queue Ignore queue job id when choosing visible GPU" \
    " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" \
    " --openmp=M.m Set env variables for the given OpenMP version" \
    " Default: 4.0" \
    " --openmp-percent=N Integer percentage of cpuset to use for OpenMP" \
    " threads Default: 100" \
    " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads" \
    " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" \
    " --force-openmp-num-threads=N" \
    " Override logic for selecting OMP_NUM_THREADS" \
    " --force-openmp-proc-bind=<OP>" \
    " Override logic for selecting OMP_PROC_BIND" \
    " --no-openmp-nested Set OMP_NESTED to false" \
    " --show-bindings Show the bindings" \
    " --lstopo Show bindings in lstopo without executing a command" \
    " -v|--verbose Show options and relevant environment variables" \
    " -h|--help Show this message" \
    "" \
    "Sample Usage:" \
    " Split the current process cpuset into 4 and use the 3rd partition" \
    " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." \
    " Bind the process to all even cores" \
    " ${cmd} --proc-bind=core:even -v -- command ..." \
    " Bind to the first 64 cores and split the current process cpuset into 4" \
    " ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..." \
    " skip GPU 0 when mapping visible devices" \
    " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." \
    " Display the current bindings" \
    " ${cmd} --proc-bind=numa:0 --show-bindings -- command" \
    " Display the current bindings using lstopo" \
    " ${cmd} --proc-bind=numa:0.core:odd --lstopo" \
    ""
}
|
||||
|
||||
|
||||
################################################################################
|
||||
# Parse command line arguments
|
||||
################################################################################
|
||||
# Show help if no command line arguments given
if [[ "$#" -eq 0 ]]; then
  show_help
  exit 0
fi

# Defaults for every option; the loop below overrides them.
declare -a UNKNOWN_ARGS=()
declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
declare -i HPCBIND_DISTRIBUTE=1
declare -i HPCBIND_PARTITION=0
HPCBIND_PROC_BIND="all"
HPCBIND_OPENMP_VERSION=4.0
declare -i HPCBIND_OPENMP_PERCENT=100
HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1
HPCBIND_OPENMP_FORCE_PROC_BIND=""
HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
declare -i HPCBIND_VERBOSE=0

declare -i HPCBIND_SHOW_BINDINGS=0
declare -i HPCBIND_LSTOPO=0

# Walk the options up to "--".  Every handled option is shifted away so that
# afterwards "$@" holds only the command to run.  "$@" is quoted so that an
# argument containing whitespace stays one argument.
for i in "$@"; do
  case "$i" in
    # disable hwloc binding
    --no-hwloc-bind)
      HPCBIND_ENABLE_HWLOC_BIND=0
      shift
      ;;
    # initial process mask
    --proc-bind=*)
      HPCBIND_PROC_BIND="${i#*=}"
      shift
      ;;
    # number of partitions to create
    --distribute=*)
      HPCBIND_DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which partition to use
    --distribute-partition=*)
      HPCBIND_PARTITION="${i#*=}"
      shift
      ;;
    # comma separated on the command line, whitespace separated internally
    --visible-gpus=*)
      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
      shift
      ;;
    --gpu-ignore-queue)
      HPCBIND_QUEUE_GPU_MAPPING=0
      shift
      ;;
    --no-gpu-mapping)
      HPCBIND_ENABLE_GPU_MAPPING=0
      shift
      ;;
    --openmp=*)
      HPCBIND_OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --openmp-percent=*)
      HPCBIND_OPENMP_PERCENT="${i#*=}"
      shift
      ;;
    --openmp-places=*)
      HPCBIND_OPENMP_PLACES="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      HPCBIND_OPENMP_PROC_BIND=0
      shift
      ;;
    --force-openmp-proc-bind=*)
      HPCBIND_OPENMP_FORCE_PROC_BIND="${i#*=}"
      shift
      ;;
    --force-openmp-num-threads=*)
      HPCBIND_OPENMP_FORCE_NUM_THREADS="${i#*=}"
      shift
      ;;
    --no-openmp-nested)
      HPCBIND_OPENMP_NESTED="false"
      shift
      ;;
    # showing bindings implies verbose output
    --show-bindings)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=1
      shift
      ;;
    # lstopo mode implies verbose output and replaces --show-bindings
    --lstopo)
      HPCBIND_VERBOSE=1
      HPCBIND_SHOW_BINDINGS=0
      HPCBIND_LSTOPO=1
      shift
      ;;
    -v|--verbose)
      HPCBIND_VERBOSE=1
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check unknown arguments
|
||||
################################################################################
|
||||
# Abort with status 1 when the option parser collected unrecognised options.
# The count is compared numerically (-gt), not as a string.
if [[ ${#UNKNOWN_ARGS[@]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check that visible gpus are valid
|
||||
################################################################################
|
||||
# Turn the whitespace separated GPU list into an array and, when GPU mapping
# is enabled, replace every id outside [0, NUM_GPUS) by 0.  Afterwards
# NUM_GPUS is the number of entries in the list.
HPCBIND_VISIBLE_GPUS=(${HPCBIND_VISIBLE_GPUS})
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[@]}; i++)); do
    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
          ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
      echo "Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
      HPCBIND_VISIBLE_GPUS[$i]=0
    fi
  done
  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
  # An empty list (e.g. "--visible-gpus=") would make the later
  # "... % NUM_GPUS" a division by zero, so give up on mapping instead.
  if [[ ${NUM_GPUS} -eq 0 ]]; then
    echo "No visible GPUs given, disabling GPU mapping"
    HPCBIND_ENABLE_GPU_MAPPING=0
  fi
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Check OpenMP percent
|
||||
################################################################################
|
||||
# Clamp the requested OpenMP percentage into [1, 100], reporting any change.
if (( HPCBIND_OPENMP_PERCENT < 1 )); then
  echo "OpenMP percent < 1, setting to 1"
  HPCBIND_OPENMP_PERCENT=1
elif (( HPCBIND_OPENMP_PERCENT > 100 )); then
  echo "OpenMP percent > 100, setting to 100"
  HPCBIND_OPENMP_PERCENT=100
fi
|
||||
|
||||
################################################################################
|
||||
# Check distribute
|
||||
################################################################################
|
||||
# The number of partitions must be positive.
if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  HPCBIND_DISTRIBUTE=1
fi

# The selected partition must lie in [0, HPCBIND_DISTRIBUTE).  It is used
# later as an array index, so a negative value is rejected as well.
if [[ ${HPCBIND_PARTITION} -lt 0 || ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
  echo "Invalid input for distribute-partition, changing to 0"
  HPCBIND_PARTITION=0
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Find cpuset and num threads
|
||||
################################################################################
|
||||
# Work out the cpuset to bind to (hwloc only) and how many processing units
# (PUs) it has, then derive the OpenMP thread count from that.
HPCBIND_HWLOC_CPUSET=""
declare -i HPCBIND_NUM_PUS=0

if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -ne 1 ]]; then
  # No hwloc binding: count the "processor" lines of /proc/cpuinfo.
  HPCBIND_NUM_PUS=$(grep -c processor < /proc/cpuinfo)
else
  # Resolve the requested location, restricted to the parent cpuset if known.
  if [[ -z "${HPCBIND_HWLOC_PARENT_CPUSET}" ]]; then
    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
  else
    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
  fi

  # Split the binding into partitions and pick the selected one.
  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
  HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
  HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
fi

# Requested share of the PUs, kept within [1, HPCBIND_NUM_PUS].
declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT / 100))
if (( HPCBIND_OPENMP_NUM_THREADS < 1 )); then
  HPCBIND_OPENMP_NUM_THREADS=1
elif (( HPCBIND_OPENMP_NUM_THREADS > HPCBIND_NUM_PUS )); then
  HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS}
fi

# An explicit positive thread count overrides the computed value.
if (( HPCBIND_OPENMP_FORCE_NUM_THREADS > 0 )); then
  HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS}
fi
|
||||
|
||||
################################################################################
|
||||
# Set OpenMP environment variables
|
||||
################################################################################
|
||||
|
||||
# Thread count for the launched command.
export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS}

# OMP_PROC_BIND / OMP_PLACES: cleared when binding is disabled, taken from the
# forced value when one was given, otherwise chosen by OpenMP version.
if [[ ${HPCBIND_OPENMP_PROC_BIND} -ne 1 ]]; then
  unset OMP_PLACES
  unset OMP_PROC_BIND
elif [[ -n "${HPCBIND_OPENMP_FORCE_PROC_BIND}" ]]; then
  export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
  export OMP_PROC_BIND="${HPCBIND_OPENMP_FORCE_PROC_BIND}"
elif [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then
  # version compares (as a string) at or above 4.0
  export OMP_PLACES="${HPCBIND_OPENMP_PLACES}"
  export OMP_PROC_BIND="spread"
else
  export OMP_PROC_BIND="true"
  unset OMP_PLACES
fi

# Nested parallelism setting.
export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
|
||||
|
||||
|
||||
################################################################################
|
||||
# Set CUDA environment variables
|
||||
################################################################################
|
||||
|
||||
# Pick one entry of the visible GPU list, round robin, and export it as
# CUDA_VISIBLE_DEVICES.  With queue mapping the index is derived from the
# queue index and the partition; without it, from the partition alone.
if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -ne 0 ]]; then
    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
  else
    declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
  fi
  export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
fi
|
||||
|
||||
################################################################################
|
||||
# Set hpcbind environment variables
|
||||
################################################################################
|
||||
# Publish the decisions taken above to the launched command.
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
# An unknown parent cpuset is reported as "all".
export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET:-all}
export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
# The GPU list is exported comma separated.
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(tr ' ' ',' <<< "${HPCBIND_VISIBLE_GPUS[*]}")
export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}

# Queue details are only exported when a launcher was recognised.
if [[ -n "${HPCBIND_QUEUE_NAME}" ]]; then
  export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
  export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
  export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
fi
|
||||
|
||||
|
||||
################################################################################
|
||||
# Print verbose
|
||||
################################################################################
|
||||
|
||||
# In verbose mode list the exported HPCBIND_*, CUDA_* and OMP_* variables,
# sorted by name, each group under its own heading.
if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
  MY_ENV=$(env | sort)
  echo "[HPCBIND]"
  grep -E "^HPCBIND_" <<< "${MY_ENV}"
  echo "[CUDA]"
  grep -E "^CUDA_" <<< "${MY_ENV}"
  echo "[OPENMP]"
  grep -E "^OMP_" <<< "${MY_ENV}"
fi

# Listing the processing units of the chosen cpuset needs hwloc.
if [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    echo "[BINDINGS]"
    hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
  else
    echo "Unable to show bindings, hwloc not available."
  fi
fi
|
||||
|
||||
################################################################################
|
||||
# Run command
|
||||
################################################################################
|
||||
|
||||
# Normal mode: run the user command, bound to the chosen cpuset when hwloc
# binding is enabled.  "$@" is quoted so that arguments containing
# whitespace reach the command unchanged.
if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- "$@"
  else
    eval "$@"
  fi
else
  # --lstopo mode: show the bindings instead of running a command.
  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
      # DISPLAY is set: list the PUs, then start lstopo bound to the cpuset.
      echo "[BINDINGS]"
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
    else
      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
    fi
  else
    echo "Unable to show bindings, hwloc not available."
  fi
fi
|
|
@ -0,0 +1,221 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# HAS_HWLOC counts the hwloc commands that are missing; 0 means all found.
declare -i HAS_HWLOC=0
type hwloc-bind >/dev/null 2>&1 || HAS_HWLOC+=1
type hwloc-distrib >/dev/null 2>&1 || HAS_HWLOC+=1
type hwloc-ls >/dev/null 2>&1 || HAS_HWLOC+=1
type hwloc-calc >/dev/null 2>&1 || HAS_HWLOC+=1
type hwloc-ps >/dev/null 2>&1 || HAS_HWLOC+=1

# Option defaults, overridden by the command line parser.
declare -a UNKNOWN_ARGS=()
declare -i DISTRIBUTE=1
declare -i INDEX=0
PROC_BIND="all"
CURRENT_CPUSET=""
OPENMP_VERSION=4.0
OPENMP_PROC_BIND=True
OPENMP_NESTED=True
VERBOSE=False

# With hwloc available, look up (and print) the cpuset of this process.
if (( HAS_HWLOC == 0 )); then
  MY_PID="$BASHPID"
  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
  echo "$CURRENT_CPUSET"
fi
|
||||
|
||||
# Print the usage text of this script to stdout.
# Takes no arguments; the program name shown is the basename of $0.
function show_help {
  local cmd=$(basename "$0")
  # One argument per output line; "" produces an empty line.
  printf '%s\n' \
    "Usage: ${cmd} <options> -- command ..." \
    " Uses hwloc to divide the node into the given number of groups," \
    " set the appropriate OMP_NUM_THREADS and execute the command on the" \
    " selected group." \
    "" \
    " NOTE: This command assumes it has exclusive use of the node" \
    "" \
    "Options:" \
    " --proc-bind=<LOC> Set the initial process mask for the script. " \
    " LOC can be any valid location argument for" \
    " hwloc-calc. Defaults to the entire machine" \
    " --distribute=N Distribute the current proc-bind into N groups" \
    " --index=I Use the i'th group (zero based)" \
    " --openmp=M.m Set env variables for the given OpenMP version" \
    " (default 4.0)" \
    " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" \
    " --no-openmp-nested Set OMP_NESTED to false" \
    " -v|--verbose" \
    " -h|--help" \
    "" \
    "Sample Usage:" \
    " ${cmd} --distribute=4 --index=2 -v -- command ..." \
    ""
}
|
||||
|
||||
# Without any argument there is nothing to run: show the help and stop.
if [[ "$#" -eq 0 ]]; then
  show_help
  exit 0
fi

# Walk the options up to "--".  Every handled option is shifted away so that
# afterwards "$@" holds only the command to run.  "$@" is quoted so that an
# argument containing whitespace stays one argument.
for i in "$@"; do
  case "$i" in
    # initial process mask
    --proc-bind=*)
      PROC_BIND="${i#*=}"
      shift
      ;;
    # number of groups to create
    --distribute=*)
      DISTRIBUTE="${i#*=}"
      shift
      ;;
    # which group to use
    --index=*)
      INDEX="${i#*=}"
      shift
      ;;
    --openmp=*)
      OPENMP_VERSION="${i#*=}"
      shift
      ;;
    --no-openmp-proc-bind)
      OPENMP_PROC_BIND=False
      shift
      ;;
    --no-openmp-nested)
      OPENMP_NESTED=False
      shift
      ;;
    -v|--verbose)
      VERBOSE=True
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # ignore remaining arguments
    --)
      shift
      break
      ;;
    # unknown option
    *)
      UNKNOWN_ARGS+=("$i")
      shift
      ;;
  esac
done
|
||||
|
||||
# Abort with status 1 when the option parser collected unrecognised options.
# The count is compared numerically (-gt), not as a string.
if [[ ${#UNKNOWN_ARGS[@]} -gt 0 ]]; then
  echo "Unknown options: ${UNKNOWN_ARGS[*]}"
  exit 1
fi
|
||||
|
||||
# The number of groups must be positive.
if [[ ${DISTRIBUTE} -le 0 ]]; then
  echo "Invalid input for distribute, changing distribute to 1"
  DISTRIBUTE=1
fi

# The selected group must lie in [0, DISTRIBUTE).  It is used later as an
# array index, so a negative value is rejected as well.
if [[ ${INDEX} -lt 0 || ${INDEX} -ge ${DISTRIBUTE} ]]; then
  echo "Invalid input for index, changing index to 0"
  INDEX=0
fi

# HAS_HWLOC is non-zero when an hwloc command is missing; without hwloc the
# node cannot be divided, so fall back to a single group.
if [[ ${HAS_HWLOC} -ne 0 ]]; then
  echo "hwloc not found, no process binding will occur"
  DISTRIBUTE=1
  INDEX=0
fi
|
||||
|
||||
# Export the OpenMP environment for the command about to be launched.
# Reads OPENMP_PROC_BIND, OPENMP_VERSION, OPENMP_NESTED and NUM_THREADS.
function set_openmp_env {
  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
      export OMP_PLACES="threads"
      export OMP_PROC_BIND="spread"
    else
      export OMP_PROC_BIND="true"
      unset OMP_PLACES
    fi
  else
    unset OMP_PLACES
    unset OMP_PROC_BIND
  fi

  if [[ "${OPENMP_NESTED}" == "True" ]]; then
    export OMP_NESTED="true"
  else
    export OMP_NESTED="false"
  fi

  export OMP_NUM_THREADS="${NUM_THREADS}"
}

# HAS_HWLOC is 0 when every hwloc command was found.
if [[ ${HAS_HWLOC} -eq 0 ]]; then

  # Resolve the requested location, restricted to the current cpuset if known.
  if [[ "${CURRENT_CPUSET}" == "" ]]; then
    BINDING=$(hwloc-calc ${PROC_BIND})
  else
    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
  fi

  # Split the binding into groups, pick the selected one and count its PUs.
  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
  CPUSET=${CPUSETS[${INDEX}]}
  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: true"
    echo " proc_bind: ${PROC_BIND}"
    echo " distribute: ${DISTRIBUTE}"
    echo " index: ${INDEX}"
    echo " parent_cpuset: ${CURRENT_CPUSET}"
    echo " cpuset: ${CPUSET}"
    echo "omp_num_threads: ${NUM_THREADS}"
    echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
    echo "omp_nested: ${OPENMP_NESTED}"
    echo "OpenMP: ${OPENMP_VERSION}"
  fi

  set_openmp_env

  # "$@" is quoted so arguments containing whitespace reach the command intact.
  hwloc-bind ${CPUSET} -- "$@"
else
  # No hwloc: use one thread per "processor" line of /proc/cpuinfo.
  NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)

  if [[ "${VERBOSE}" == "True" ]]; then
    echo "hwloc: false"
    echo "omp_num_threads: ${NUM_THREADS}"
    echo "omp_proc_bind: ${OPENMP_PROC_BIND}"
    echo "omp_nested: ${OPENMP_NESTED}"
    echo "OpenMP: ${OPENMP_VERSION}"
  fi

  set_openmp_env

  eval "$@"
fi
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Print the physical (symlink free) path of the parent of the directory that
# contains this script.  Changes directory, so call it in a subshell.
function get_path() {
  cd "$(dirname "$0")"
  cd ..
  pwd -P
}

# The command substitution runs get_path in a subshell, so the caller's
# working directory is left alone.
KOKKOS_PATH="$(get_path "$0")"
|
||||
|
||||
# Print the usage text of this script, followed by the help output of
# generate_makefile.bash.  Always returns 0.
function show_help() {
  local cmd=$(basename "$0")
  # One argument per output line; "" produces an empty line.
  printf '%s\n' \
    "Usage: ${cmd} <options> " \
    " Build and run the tests" \
    "" \
    "Options:" \
    " -j=N|--make-j=N Build the tests in parallel" \
    " -c|--clean Clean build and regenerate make files" \
    " --clean-on-pass Clean build when runtest passes" \
    " --output-prefix=<pre> Prefix of log files Default: runtest" \
    " --build-only Only build the tests" \
    " -v|--verbose Tee STDOUT and STDERR to screen and files" \
    " -h|--help Show this message" \
    ""
  ${KOKKOS_PATH}/generate_makefile.bash --help
  return 0
}
|
||||
|
||||
|
||||
# Option defaults.  Anything this script does not recognise is collected in
# GENERATE_ARGS and later forwarded to generate_makefile.bash.
declare -a GENERATE_ARGS=()
declare -i VERBOSE=0
declare -i CLEAN=0
declare -i CLEAN_ON_PASS=0
declare -i BUILD_ONLY=0
OUTPUT="runtest"

# Default build parallelism: HPCBIND_NUM_PUS when set and non-empty, else 1.
declare -i MAKE_J=${HPCBIND_NUM_PUS:-1}

# "$@" is quoted so that an argument containing whitespace is not split
# before it is matched or forwarded.
for i in "$@"; do
  case "$i" in
    -j=*|--make-j=*)
      MAKE_J=${i#*=}
      shift
      ;;
    -c|--clean)
      CLEAN=1
      shift
      ;;
    --clean-on-pass)
      CLEAN_ON_PASS=1
      shift
      ;;
    --output-prefix=*)
      OUTPUT=${i#*=}
      shift
      ;;
    --build-only)
      BUILD_ONLY=1
      shift
      ;;
    -v|--verbose)
      VERBOSE=1
      shift
      ;;
    -h|--help)
      show_help
      exit 0
      ;;
    # everything else belongs to generate_makefile.bash
    *)
      GENERATE_ARGS+=("$i")
      shift
      ;;
  esac
done
|
||||
|
||||
# Refuse to run from the repository root (KOKKOS_PATH).
# The right-hand side is quoted so it is compared literally instead of being
# interpreted as a glob pattern.
if [[ "$(pwd -P)" == "${KOKKOS_PATH}" ]]; then
  echo "Cannot call $0 from root repository path ${KOKKOS_PATH}"
  exit 1
fi
|
||||
|
||||
# Some makefile dependencies are incorrect, so cleaning removes the generated
# Makefile as well, which forces generate_makefile.bash to run again.
if (( CLEAN == 1 )); then
  START=${SECONDS}
  echo "Cleaning"
  /bin/rm -rf algorithms containers core example install Makefile >/dev/null 2>&1
  END=${SECONDS}
  echo " $((END-START)) seconds"
  # Two blank lines separate the stages in verbose mode.
  if (( VERBOSE == 1 )); then
    printf '\n\n'
  fi
fi
|
||||
|
||||
# Stage 1: generate the Makefile.  Output is written to ${OUTPUT}.out and
# ${OUTPUT}.err; stderr is also shown, stdout only in verbose mode.
declare -i START=${SECONDS}
echo "Generating Makefile"
echo " ${KOKKOS_PATH}/generate_makefile.bash --kokkos-path=${KOKKOS_PATH} ${GENERATE_ARGS[@]}"

if (( VERBOSE == 0 )); then
  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > ${OUTPUT}.out 2> >(tee ${OUTPUT}.err >&2)
else
  "${KOKKOS_PATH}"/generate_makefile.bash --kokkos-path="${KOKKOS_PATH}" "${GENERATE_ARGS[@]}" > >(tee ${OUTPUT}.out) 2> >(tee ${OUTPUT}.err >&2)
fi
# $? here is the status of the generate_makefile.bash call that just ran.
declare -i RESULT=$?
declare -i END=${SECONDS}

# On failure show the FAIL lines of both logs and stop.
if (( RESULT != 0 )); then
  cat ${OUTPUT}.out | grep "FAIL"
  cat ${OUTPUT}.err | grep "FAIL"
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

echo " PASS: $((END-START)) seconds"
if (( VERBOSE == 1 )); then
  printf '\n\n'
fi
|
||||
|
||||
# Stage 2: build the tests with MAKE_J parallel jobs, appending to the logs.
START=${SECONDS}
echo "Building"
if (( VERBOSE == 0 )); then
  make --keep-going -j ${MAKE_J} build-test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
else
  make --keep-going -j ${MAKE_J} build-test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
fi
# $? here is the status of the make call that just ran.
RESULT=$?
END=${SECONDS}

# On failure show the compiler error lines of both logs and stop.
if (( RESULT != 0 )); then
  cat ${OUTPUT}.out | grep -E "[[:space:]]error:[[:space:]]"
  cat ${OUTPUT}.err | grep -E "[[:space:]]error:[[:space:]]"
  echo " FAIL: $((END-START)) seconds"
  exit 1
fi

echo " PASS: $((END-START)) seconds"
if (( VERBOSE == 1 )); then
  printf '\n\n'
fi
|
||||
|
||||
# Stage 3: run the tests unless --build-only was given, appending to the logs.
if (( BUILD_ONLY == 0 )); then
  START=${SECONDS}
  echo "Testing"
  if (( VERBOSE == 0 )); then
    make --keep-going test >> ${OUTPUT}.out 2> >(tee -a ${OUTPUT}.err >&2)
  else
    make --keep-going test > >(tee -a ${OUTPUT}.out) 2> >(tee -a ${OUTPUT}.err >&2)
  fi
  # $? here is the status of the make call that just ran.
  RESULT=$?
  END=${SECONDS}

  # On failure show the FAIL lines of both logs and stop.
  if (( RESULT != 0 )); then
    cat ${OUTPUT}.out | grep "FAIL"
    cat ${OUTPUT}.err | grep "FAIL"
    echo " FAIL: $((END-START)) seconds"
    exit 1
  fi

  echo " PASS: $((END-START)) seconds"
  if (( CLEAN_ON_PASS == 1 )); then
    make clean
  fi
fi

# The exit status is that of the last stage that ran.
exit ${RESULT}
|
||||
|
|
@ -999,8 +999,12 @@ SET (Kokkos_INCLUDE_DIRS
|
|||
${Kokkos_SOURCE_DIR}/containers/src
|
||||
${Kokkos_SOURCE_DIR}/algorithms/src
|
||||
${Kokkos_BINARY_DIR} # to find KokkosCore_config.h
|
||||
${KOKKOS_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
# pass include dirs back to parent scope
|
||||
SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
|
||||
|
||||
INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
|
||||
|
||||
IF(KOKKOS_SEPARATE_LIBS)
|
||||
|
|
|
@ -7,3 +7,4 @@ tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
|
|||
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
|
||||
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
|
||||
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
|
||||
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
#include <cstdio>
|
||||
#include <cuda_runtime_api.h>
|
||||
int main()
|
||||
{
|
||||
cudaDeviceProp prop;
|
||||
const cudaError_t err_code = cudaGetDeviceProperties(&prop, 0);
|
||||
if (cudaSuccess != err_code) {
|
||||
fprintf(stderr,"cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err_code));
|
||||
return -1;
|
||||
}
|
||||
switch (prop.major) {
|
||||
case 3:
|
||||
printf("Kepler"); break;
|
||||
case 5:
|
||||
printf("Maxwell"); break;
|
||||
case 6:
|
||||
printf("Pascal"); break;
|
||||
default:
|
||||
fprintf(stderr, "Unspported Device %d%d\n", (int)prop.major, (int)prop.minor);
|
||||
return -1;
|
||||
}
|
||||
printf("%d%d\n", (int)prop.major, (int)prop.minor);
|
||||
return 0;
|
||||
}
|
|
@ -160,9 +160,14 @@ if [ "$MACHINE" = "sems" ]; then
|
|||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
|
@ -280,13 +285,13 @@ elif [ "$MACHINE" = "apollo" ]; then
|
|||
"gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/4.0.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
else
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"clang/head $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS"
|
||||
"clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS"
|
||||
"gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
|
@ -584,7 +589,7 @@ single_build_and_test() {
|
|||
else
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
local -i build_start_time=$(date +%s)
|
||||
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
local -i build_end_time=$(date +%s)
|
||||
comment="build_time=$(($build_end_time-$build_start_time))"
|
||||
|
||||
|
|
|
@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=ON
|
|||
export JENKINS_DO_SERIAL=OFF
|
||||
export JENKINS_DO_COMPLEX=OFF
|
||||
|
||||
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
|
||||
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
|
||||
|
||||
export JENKINS_DO_TESTS=ON
|
||||
export JENKINS_DO_EXAMPLES=ON
|
||||
export JENKINS_DO_SHARED=OFF
|
||||
export JENKINS_DO_SHARED=ON
|
||||
|
||||
export QUEUE=haswell
|
||||
|
||||
|
|
|
@ -28,14 +28,14 @@ export JENKINS_DO_PTHREAD=OFF
|
|||
export JENKINS_DO_SERIAL=ON
|
||||
export JENKINS_DO_COMPLEX=ON
|
||||
|
||||
export ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_CXX_FLAG="-xCORE-AVX2 -mkl"
|
||||
export JENKINS_ARCH_C_FLAG="-xCORE-AVX2 -mkl"
|
||||
export BLAS_LIBRARIES="-mkl;${MKLROOT}/lib/intel64/libmkl_intel_lp64.a;${MKLROOT}/lib/intel64/libmkl_intel_thread.a;${MKLROOT}/lib/intel64/libmkl_core.a"
|
||||
export LAPACK_LIBRARIES=${BLAS_LIBRARIES}
|
||||
|
||||
export JENKINS_DO_TESTS=ON
|
||||
export JENKINS_DO_EXAMPLES=ON
|
||||
export JENKINS_DO_SHARED=OFF
|
||||
export JENKINS_DO_SHARED=ON
|
||||
|
||||
export QUEUE=haswell
|
||||
|
||||
|
|
|
@ -60,7 +60,6 @@ test-threads: KokkosContainers_PerformanceTest_Threads
|
|||
test-openmp: KokkosContainers_PerformanceTest_OpenMP
|
||||
./KokkosContainers_PerformanceTest_OpenMP
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,12 +36,15 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
|
|
|
@ -69,30 +69,13 @@ protected:
|
|||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned num_threads = 4;
|
||||
|
||||
if (Kokkos::hwloc::available()) {
|
||||
num_threads = Kokkos::hwloc::get_available_numa_count()
|
||||
* Kokkos::hwloc::get_available_cores_per_numa()
|
||||
* Kokkos::hwloc::get_available_threads_per_core()
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
std::cout << "OpenMP: " << num_threads << std::endl;
|
||||
|
||||
Kokkos::OpenMP::initialize( num_threads );
|
||||
|
||||
std::cout << "available threads: " << omp_get_max_threads() << std::endl;
|
||||
Kokkos::OpenMP::initialize();
|
||||
Kokkos::OpenMP::print_configuration( std::cout );
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
{
|
||||
Kokkos::OpenMP::finalize();
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
ASSERT_EQ( 1 , omp_get_max_threads() );
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -564,7 +564,7 @@ namespace Impl {
|
|||
template< class D, class A1, class A2, class A3, class ... Args >
|
||||
struct DualViewSubview {
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::ViewMapping
|
||||
typedef typename Kokkos::Impl::ViewMapping
|
||||
< void
|
||||
, Kokkos::ViewTraits< D, A1, A2, A3 >
|
||||
, Args ...
|
||||
|
|
|
@ -46,19 +46,6 @@
|
|||
///
|
||||
/// This header file declares and defines Kokkos::Experimental::DynRankView and its
|
||||
/// related nonmember functions.
|
||||
/*
|
||||
* Changes from View
|
||||
* 1. The rank of the DynRankView is returned by the method rank()
|
||||
* 2. Max rank of a DynRankView is 7
|
||||
* 3. subview name is subdynrankview
|
||||
* 4. Every subdynrankview is returned with LayoutStride
|
||||
*
|
||||
* NEW: Redesigned DynRankView
|
||||
* 5. subview function name now available
|
||||
* 6. Copy and Copy-Assign View to DynRankView
|
||||
* 7. deep_copy between Views and DynRankViews
|
||||
* 8. rank( view ); returns the rank of View or DynRankView
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_DYNRANKVIEW_HPP
|
||||
#define KOKKOS_DYNRANKVIEW_HPP
|
||||
|
@ -117,6 +104,14 @@ struct DynRankDimTraits {
|
|||
, layout.dimension[7] );
|
||||
}
|
||||
|
||||
// Extra overload to match that for specialize types v2
|
||||
template <typename Layout, typename ... P>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static size_t computeRank( const Kokkos::Impl::ViewCtorProp<P...>& prop, const Layout& layout )
|
||||
{
|
||||
return computeRank(layout);
|
||||
}
|
||||
|
||||
// Create the layout for the rank-7 view.
|
||||
// Non-strided Layout
|
||||
template <typename Layout>
|
||||
|
@ -158,8 +153,17 @@ struct DynRankDimTraits {
|
|||
);
|
||||
}
|
||||
|
||||
// Extra overload to match that for specialize types
|
||||
template <typename Traits, typename ... P>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<typename Traits::array_layout , Kokkos::LayoutRight>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutLeft>::value || std::is_same<typename Traits::array_layout , Kokkos::LayoutStride>::value) , typename Traits::array_layout >::type createLayout( const ViewCtorProp<P...>& prop, const typename Traits::array_layout& layout )
|
||||
{
|
||||
return createLayout( layout );
|
||||
}
|
||||
|
||||
// Create a view from the given dimension arguments.
|
||||
// This is only necessary because the shmem constructor doesn't take a layout.
|
||||
// NDE shmem Views are not compatible with the added view_alloc value_type / fad_dim deduction functionality
|
||||
template <typename ViewType, typename ViewArg>
|
||||
static ViewType createView( const ViewArg& arg
|
||||
, const size_t N0
|
||||
|
@ -186,7 +190,8 @@ struct DynRankDimTraits {
|
|||
// Non-strided Layout
|
||||
template <typename Layout , typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type
|
||||
reconstructLayout( const Layout& layout , iType dynrank )
|
||||
{
|
||||
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
|
||||
, dynrank > 1 ? layout.dimension[1] : ~size_t(0)
|
||||
|
@ -202,7 +207,8 @@ struct DynRankDimTraits {
|
|||
// LayoutStride
|
||||
template <typename Layout , typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
|
||||
static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type
|
||||
reconstructLayout( const Layout& layout , iType dynrank )
|
||||
{
|
||||
return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
|
||||
, dynrank > 0 ? layout.stride[0] : (0)
|
||||
|
@ -311,6 +317,11 @@ void dyn_rank_view_verify_operator_bounds
|
|||
/** \brief Assign compatible default mappings */
|
||||
struct ViewToDynRankViewTag {};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class DstTraits , class SrcTraits >
|
||||
class ViewMapping< DstTraits , SrcTraits ,
|
||||
typename std::enable_if<(
|
||||
|
@ -337,7 +348,7 @@ class ViewMapping< DstTraits , SrcTraits ,
|
|||
)
|
||||
)
|
||||
)
|
||||
) , ViewToDynRankViewTag >::type >
|
||||
) , Kokkos::Experimental::Impl::ViewToDynRankViewTag >::type >
|
||||
{
|
||||
private:
|
||||
|
||||
|
@ -376,7 +387,7 @@ public:
|
|||
|
||||
typedef typename DstType::offset_type dst_offset_type ;
|
||||
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
|
||||
dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
|
||||
dst.m_track.assign( src.m_track , DstTraits::is_managed );
|
||||
dst.m_rank = src.Rank ;
|
||||
}
|
||||
|
@ -384,22 +395,20 @@ public:
|
|||
|
||||
} //end Impl
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/* \class DynRankView
|
||||
* \brief Container that creates a Kokkos view with rank determined at runtime.
|
||||
* Essentially this is a rank 7 view that wraps the access operators
|
||||
* to yield the functionality of a view
|
||||
* Essentially this is a rank 7 view
|
||||
*
|
||||
* Changes from View
|
||||
* 1. The rank of the DynRankView is returned by the method rank()
|
||||
* 2. Max rank of a DynRankView is 7
|
||||
* 3. subview name is subdynrankview
|
||||
* 4. Every subdynrankview is returned with LayoutStride
|
||||
*
|
||||
* NEW: Redesigned DynRankView
|
||||
* 5. subview function name now available
|
||||
* 6. Copy and Copy-Assign View to DynRankView
|
||||
* 7. deep_copy between Views and DynRankViews
|
||||
* 8. rank( view ); returns the rank of View or DynRankView
|
||||
* 3. subview called with 'subview(...)' or 'subdynrankview(...)' (backward compatibility)
|
||||
* 4. Every subview is returned with LayoutStride
|
||||
* 5. Copy and Copy-Assign View to DynRankView
|
||||
* 6. deep_copy between Views and DynRankViews
|
||||
* 7. rank( view ); returns the rank of View or DynRankView
|
||||
*
|
||||
*/
|
||||
|
||||
|
@ -427,7 +436,7 @@ public:
|
|||
|
||||
|
||||
private:
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , void > map_type ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
track_type m_track ;
|
||||
|
@ -556,7 +565,7 @@ public:
|
|||
// Allow specializations to query their specialized map
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
|
||||
const Kokkos::Impl::ViewMapping< traits , void > &
|
||||
implementation_map() const { return m_map ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
@ -803,7 +812,7 @@ public:
|
|||
, m_rank(rhs.m_rank)
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
}
|
||||
|
@ -813,7 +822,7 @@ public:
|
|||
DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( m_map , rhs.m_map , rhs.m_track );
|
||||
m_track.assign( rhs.m_track , traits::is_managed );
|
||||
|
@ -831,7 +840,7 @@ public:
|
|||
, m_rank( rhs.Rank )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
|
||||
Mapping::assign( *this , rhs );
|
||||
}
|
||||
|
@ -841,7 +850,7 @@ public:
|
|||
DynRankView & operator = ( const View<RT,RP...> & rhs )
|
||||
{
|
||||
typedef typename View<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
|
||||
Mapping::assign( *this , rhs );
|
||||
return *this ;
|
||||
|
@ -870,7 +879,7 @@ public:
|
|||
)
|
||||
: m_track()
|
||||
, m_map()
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
|
||||
{
|
||||
// Append layout and spaces if not input
|
||||
typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
|
||||
|
@ -923,7 +932,7 @@ public:
|
|||
//------------------------------------------------------------
|
||||
|
||||
Kokkos::Experimental::Impl::SharedAllocationRecord<> *
|
||||
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
|
||||
record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) );
|
||||
|
||||
//------------------------------------------------------------
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
@ -947,8 +956,8 @@ public:
|
|||
>::type const & arg_layout
|
||||
)
|
||||
: m_track() // No memory tracking
|
||||
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
|
||||
, m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::template createLayout<traits, P...>(arg_prop, arg_layout) )
|
||||
, m_rank( Impl::DynRankDimTraits<typename traits::specialize>::template computeRank< typename traits::array_layout, P...>(arg_prop, arg_layout) )
|
||||
{
|
||||
static_assert(
|
||||
std::is_same< pointer_type
|
||||
|
@ -1034,6 +1043,7 @@ public:
|
|||
{}
|
||||
|
||||
// For backward compatibility
|
||||
// NDE This ctor does not take ViewCtorProp argument - should not use alternative createLayout call
|
||||
explicit inline
|
||||
DynRankView( const ViewAllocateWithoutInitializing & arg_prop
|
||||
, const typename traits::array_layout & arg_layout
|
||||
|
@ -1179,6 +1189,11 @@ namespace Impl {
|
|||
|
||||
struct DynRankSubviewTag {};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class SrcTraits , class ... Args >
|
||||
struct ViewMapping
|
||||
< typename std::enable_if<(
|
||||
|
@ -1192,7 +1207,7 @@ struct ViewMapping
|
|||
std::is_same< typename SrcTraits::array_layout
|
||||
, Kokkos::LayoutStride >::value
|
||||
)
|
||||
), DynRankSubviewTag >::type
|
||||
), Kokkos::Experimental::Impl::DynRankSubviewTag >::type
|
||||
, SrcTraits
|
||||
, Args ... >
|
||||
{
|
||||
|
@ -1264,7 +1279,7 @@ public:
|
|||
};
|
||||
|
||||
|
||||
typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
|
||||
typedef Kokkos::Experimental::DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits > ret_type;
|
||||
|
||||
template < typename T , class ... P >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1336,9 +1351,10 @@ public:
|
|||
|
||||
} // end Impl
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
template< class V , class ... Args >
|
||||
using Subdynrankview = typename Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
|
||||
using Subdynrankview = typename Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , V , Args... >::ret_type ;
|
||||
|
||||
template< class D , class ... P , class ...Args >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
@ -1348,7 +1364,7 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
|
|||
if ( src.rank() > sizeof...(Args) ) //allow sizeof...(Args) >= src.rank(), ignore the remaining args
|
||||
{ Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView"); }
|
||||
|
||||
typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
typedef Kokkos::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::ViewTraits< D*******, P... > , Args... > metafcn ;
|
||||
|
||||
return metafcn::subview( src.rank() , src , args... );
|
||||
}
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -57,7 +57,7 @@ namespace Experimental {
|
|||
*/
|
||||
template< typename DataType , typename ... P >
|
||||
class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
|
||||
{
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
|
||||
|
@ -68,7 +68,7 @@ private:
|
|||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
|
||||
|
||||
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
|
||||
static_assert( traits::rank == 1 && traits::rank_dynamic == 1
|
||||
, "DynamicView must be rank-one" );
|
||||
|
||||
static_assert( std::is_trivial< typename traits::value_type >::value &&
|
||||
|
@ -216,14 +216,14 @@ public:
|
|||
// Verify that allocation of the requested chunk is in progress.
|
||||
|
||||
// The allocated chunk counter is m_chunks[ m_chunk_max ]
|
||||
const uintptr_t n =
|
||||
const uintptr_t n =
|
||||
*reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
|
||||
|
||||
if ( n <= ic ) {
|
||||
Kokkos::abort("Kokkos::DynamicView array bounds error");
|
||||
}
|
||||
|
||||
// Allocation of this chunk is in progress
|
||||
// Allocation of this chunk is in progress
|
||||
// so wait for allocation to complete.
|
||||
while ( 0 == *ch );
|
||||
}
|
||||
|
@ -267,7 +267,7 @@ public:
|
|||
const uintptr_t jc_try = jc ;
|
||||
|
||||
// Jump iteration to the chunk counter.
|
||||
|
||||
|
||||
jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
|
||||
|
||||
if ( jc_try == jc ) {
|
||||
|
@ -316,7 +316,7 @@ public:
|
|||
}
|
||||
else {
|
||||
while ( NC + 1 <= *pc ) {
|
||||
--*pc ;
|
||||
--*pc ;
|
||||
m_pool.deallocate( m_chunks[*pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*pc] = 0 ;
|
||||
|
@ -331,7 +331,7 @@ public:
|
|||
typename traits::value_type ** m_chunks ;
|
||||
uintptr_t * m_pc ;
|
||||
uintptr_t m_nc ;
|
||||
unsigned m_chunk_shift ;
|
||||
unsigned m_chunk_shift ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( int ) const
|
||||
|
@ -348,7 +348,7 @@ public:
|
|||
}
|
||||
else {
|
||||
while ( m_nc + 1 <= *m_pc ) {
|
||||
--*m_pc ;
|
||||
--*m_pc ;
|
||||
m_pool.deallocate( m_chunks[*m_pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*m_pc] = 0 ;
|
||||
|
@ -482,7 +482,7 @@ public:
|
|||
};
|
||||
|
||||
|
||||
/**\brief Allocation constructor
|
||||
/**\brief Allocation constructor
|
||||
*
|
||||
* Memory is allocated in chunks from the memory pool.
|
||||
* The chunk size conforms to the memory pool's chunk size.
|
||||
|
@ -557,7 +557,7 @@ void deep_copy( const View<T,DP...> & dst
|
|||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
|
@ -581,7 +581,7 @@ void deep_copy( const DynamicView<T,DP...> & dst
|
|||
|
||||
if ( DstExecCanAccessSrc ) {
|
||||
// Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
|
||||
Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
Kokkos::Impl::ViewRemap< dst_type , src_type >( dst , src );
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
|
|
|
@ -69,6 +69,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -94,6 +96,10 @@ TEST_F( cuda , dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::Cuda >();
|
||||
}
|
||||
|
||||
TEST_F( cuda, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Cuda >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( cuda , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
|
||||
|
|
|
@ -66,6 +66,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
#include <iomanip>
|
||||
|
||||
namespace Test {
|
||||
|
@ -76,14 +78,7 @@ protected:
|
|||
{
|
||||
std::cout << std::setprecision(5) << std::scientific;
|
||||
|
||||
unsigned threads_count = 4 ;
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
threads_count = Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa();
|
||||
}
|
||||
|
||||
Kokkos::OpenMP::initialize( threads_count );
|
||||
Kokkos::OpenMP::initialize();
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
|
@ -96,6 +91,10 @@ TEST_F( openmp, dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::OpenMP >();
|
||||
}
|
||||
|
||||
TEST_F( openmp, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::OpenMP >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( openmp, bitset )
|
||||
{
|
||||
test_bitset<Kokkos::OpenMP>();
|
||||
|
|
|
@ -67,6 +67,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class serial : public ::testing::Test {
|
||||
|
@ -85,6 +87,10 @@ TEST_F( serial, dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::Serial >();
|
||||
}
|
||||
|
||||
TEST_F( serial, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Serial >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( serial , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
|
||||
|
|
|
@ -70,6 +70,8 @@
|
|||
#include <Kokkos_ErrorReporter.hpp>
|
||||
#include <TestErrorReporter.hpp>
|
||||
|
||||
#include <TestViewCtorPropEmbeddedDim.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
class threads : public ::testing::Test {
|
||||
|
@ -103,6 +105,10 @@ TEST_F( threads , dyn_view_api) {
|
|||
TestDynViewAPI< double , Kokkos::Threads >();
|
||||
}
|
||||
|
||||
TEST_F( threads, viewctorprop_embedded_dim ) {
|
||||
TestViewCtorProp_EmbeddedDim< Kokkos::Threads >::test_vcpt( 2, 3 );
|
||||
}
|
||||
|
||||
TEST_F( threads , staticcrsgraph )
|
||||
{
|
||||
TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
|
||||
|
|
|
@ -0,0 +1,213 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_DynRankView.hpp>
|
||||
|
||||
#include <type_traits>
|
||||
#include <typeinfo>
|
||||
|
||||
namespace Test {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename ExecSpace >
|
||||
struct TestViewCtorProp_EmbeddedDim {
|
||||
|
||||
using ViewIntType = typename Kokkos::View< int**, ExecSpace >;
|
||||
using ViewDoubleType = typename Kokkos::View< double*, ExecSpace >;
|
||||
|
||||
using DynRankViewIntType = typename Kokkos::DynRankView< int, ExecSpace >;
|
||||
using DynRankViewDoubleType = typename Kokkos::DynRankView< double, ExecSpace >;
|
||||
|
||||
// Cuda 7.0 has issues with using a lambda in parallel_for to initialize the view - replace with this functor
|
||||
template < class ViewType >
|
||||
struct Functor {
|
||||
|
||||
ViewType v;
|
||||
|
||||
Functor( const ViewType & v_ ) : v(v_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const int i ) const {
|
||||
v(i) = i;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
static void test_vcpt( const int N0, const int N1 )
|
||||
{
|
||||
|
||||
// Create two views to test
|
||||
{
|
||||
using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType ;
|
||||
using VDT = typename TestViewCtorProp_EmbeddedDim::ViewDoubleType ;
|
||||
|
||||
VIT vi1("vi1", N0, N1);
|
||||
VDT vd1("vd1", N0);
|
||||
|
||||
// TEST: Test for common type between two views, one with type double, other with type int
|
||||
// Deduce common value_type and construct a view with that type
|
||||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1, vd1);
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
|
||||
#if 0
|
||||
// debug output
|
||||
for ( int i = 0; i < N0*N1; ++i ) {
|
||||
printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) );
|
||||
}
|
||||
|
||||
printf( " Common value type view: %s \n", typeid( CVT() ).name() );
|
||||
printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() );
|
||||
if ( std::is_same< CommonViewValueType, double >::value == true ) {
|
||||
printf("Proper common value_type\n");
|
||||
}
|
||||
else {
|
||||
printf("WRONG common value_type\n");
|
||||
}
|
||||
// end debug output
|
||||
#endif
|
||||
}
|
||||
|
||||
{
|
||||
// Single view
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop(vi1);
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Create two dynamic rank views to test
|
||||
{
|
||||
using VIT = typename TestViewCtorProp_EmbeddedDim::DynRankViewIntType ;
|
||||
using VDT = typename TestViewCtorProp_EmbeddedDim::DynRankViewDoubleType ;
|
||||
|
||||
VIT vi1("vi1", N0, N1);
|
||||
VDT vd1("vd1", N0);
|
||||
|
||||
// TEST: Test for common type between two views, one with type double, other with type int
|
||||
// Deduce common value_type and construct a view with that type
|
||||
{
|
||||
// Two views
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1, vd1 );
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, double >::value) , true ) ;
|
||||
}
|
||||
|
||||
{
|
||||
// Single view
|
||||
auto view_alloc_arg = Kokkos::common_view_alloc_prop( vi1 );
|
||||
typedef typename decltype( view_alloc_arg )::value_type CommonViewValueType;
|
||||
typedef typename Kokkos::View< CommonViewValueType*, ExecSpace > CVT;
|
||||
typedef typename CVT::HostMirror HostCVT;
|
||||
|
||||
// Construct View using the common type; for case of specialization, an 'embedded_dim' would be stored by view_alloc_arg
|
||||
CVT cv1( Kokkos::view_alloc( "cv1", view_alloc_arg ), N0*N1 );
|
||||
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace >(0, N0*N1),
|
||||
Functor<CVT>(cv1)
|
||||
);
|
||||
|
||||
HostCVT hcv1 = Kokkos::create_mirror_view( cv1 );
|
||||
Kokkos::deep_copy( hcv1, cv1 );
|
||||
|
||||
ASSERT_EQ( (std::is_same< CommonViewValueType, int>::value) , true ) ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // end test_vcpt
|
||||
|
||||
}; // end struct
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace Test
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,12 +36,14 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
|
|
|
@ -79,7 +79,6 @@ test-mempool: KokkosCore_PerformanceTest_Mempool
|
|||
test-taskdag: KokkosCore_PerformanceTest_TaskDAG
|
||||
./KokkosCore_PerformanceTest_TaskDAG
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,12 +36,14 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -53,6 +53,7 @@
|
|||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_abort.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
|
|||
|
||||
#endif
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
struct CudaLockArraysStruct {
|
||||
int* atomic;
|
||||
int* scratch;
|
||||
int* threadid;
|
||||
int n;
|
||||
};
|
||||
}
|
||||
}
|
||||
__device__ __constant__
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
|
||||
}
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
__device__ inline
|
||||
bool lock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void unlock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
template< typename T >
|
||||
inline
|
||||
__device__
|
||||
|
@ -192,7 +152,7 @@ namespace Impl {
|
|||
// For 2.0 capability: 48 KB L1 and 16 KB shared
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class DriverType >
|
||||
template< class DriverType>
|
||||
__global__
|
||||
static void cuda_parallel_launch_constant_memory()
|
||||
{
|
||||
|
@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
|
|||
driver();
|
||||
}
|
||||
|
||||
template< class DriverType >
|
||||
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
|
||||
__global__
|
||||
__launch_bounds__(maxTperB, minBperSM)
|
||||
static void cuda_parallel_launch_constant_memory()
|
||||
{
|
||||
const DriverType & driver =
|
||||
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
|
||||
|
||||
driver();
|
||||
}
|
||||
|
||||
template< class DriverType>
|
||||
__global__
|
||||
static void cuda_parallel_launch_local_memory( const DriverType driver )
|
||||
{
|
||||
driver();
|
||||
}
|
||||
|
||||
template < class DriverType ,
|
||||
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
|
||||
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
|
||||
__global__
|
||||
__launch_bounds__(maxTperB, minBperSM)
|
||||
static void cuda_parallel_launch_local_memory( const DriverType driver )
|
||||
{
|
||||
driver();
|
||||
}
|
||||
|
||||
template < class DriverType
|
||||
, class LaunchBounds = Kokkos::LaunchBounds<>
|
||||
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
|
||||
struct CudaParallelLaunch ;
|
||||
|
||||
template < class DriverType >
|
||||
struct CudaParallelLaunch< DriverType , true > {
|
||||
template < class DriverType, class LaunchBounds >
|
||||
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
|
||||
|
||||
inline
|
||||
CudaParallelLaunch( const DriverType & driver
|
||||
|
@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
|
|||
}
|
||||
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else if ( shmem ) {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
|
||||
} else {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
// Copy functor to constant memory on the device
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
|
||||
|
||||
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
// Invoke the driver function on the device
|
||||
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
|
||||
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
|
|||
}
|
||||
};
|
||||
|
||||
template < class DriverType >
|
||||
struct CudaParallelLaunch< DriverType , false > {
|
||||
template < class DriverType, class LaunchBounds >
|
||||
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
|
||||
|
||||
inline
|
||||
CudaParallelLaunch( const DriverType & driver
|
||||
|
@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
|
|||
}
|
||||
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
|
||||
else if ( shmem ) {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
|
||||
} else {
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
|
||||
|
||||
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
|
||||
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
|
|
@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
|
|||
} catch(...) {}
|
||||
}
|
||||
|
||||
constexpr const char* CudaSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
constexpr const char* CudaUVMSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
constexpr const char* CudaHostPinnedSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
|
|||
SharedAllocationRecord< Kokkos::CudaSpace , void > *
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
|
||||
{
|
||||
using Header = SharedAllocationHeader ;
|
||||
using RecordBase = SharedAllocationRecord< void , void > ;
|
||||
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
|
||||
|
||||
#if 0
|
||||
using Header = SharedAllocationHeader ;
|
||||
|
||||
// Copy the header from the allocation
|
||||
Header head ;
|
||||
|
||||
|
@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
|
|||
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
__global__ void init_lock_array_kernel_atomic() {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if(i<CUDA_SPACE_ATOMIC_MASK+1)
|
||||
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
|
||||
}
|
||||
|
||||
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
|
||||
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if(i<N) {
|
||||
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
|
||||
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace Impl {
|
||||
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
cudaFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void init_lock_arrays_cuda_space() {
|
||||
static int is_initialized = 0;
|
||||
if(! is_initialized) {
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
|
||||
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
|
||||
}
|
||||
}
|
||||
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
||||
static void* ptr = NULL;
|
||||
static std::int64_t current_size = 0;
|
||||
|
@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
|||
return ptr;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
|
@ -69,9 +70,6 @@
|
|||
__device__ __constant__
|
||||
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
|
||||
|
||||
__device__ __constant__
|
||||
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
@ -103,6 +101,7 @@ int cuda_kernel_arch()
|
|||
return arch ;
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_UVM
|
||||
bool cuda_launch_blocking()
|
||||
{
|
||||
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
|
||||
|
@ -111,16 +110,13 @@ bool cuda_launch_blocking()
|
|||
|
||||
return atoi(env);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void cuda_device_synchronize()
|
||||
{
|
||||
// static const bool launch_blocking = cuda_launch_blocking();
|
||||
|
||||
// if (!launch_blocking) {
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
// }
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
|
||||
|
@ -240,6 +236,7 @@ public:
|
|||
unsigned m_maxWarpCount ;
|
||||
unsigned m_maxBlock ;
|
||||
unsigned m_maxSharedWords ;
|
||||
uint32_t m_maxConcurrency ;
|
||||
size_type m_scratchSpaceCount ;
|
||||
size_type m_scratchFlagsCount ;
|
||||
size_type m_scratchUnifiedCount ;
|
||||
|
@ -248,6 +245,7 @@ public:
|
|||
size_type * m_scratchSpace ;
|
||||
size_type * m_scratchFlags ;
|
||||
size_type * m_scratchUnified ;
|
||||
uint32_t * m_scratchConcurrentBitset ;
|
||||
cudaStream_t * m_stream ;
|
||||
|
||||
static int was_initialized;
|
||||
|
@ -274,6 +272,7 @@ public:
|
|||
, m_maxWarpCount( 0 )
|
||||
, m_maxBlock( 0 )
|
||||
, m_maxSharedWords( 0 )
|
||||
, m_maxConcurrency( 0 )
|
||||
, m_scratchSpaceCount( 0 )
|
||||
, m_scratchFlagsCount( 0 )
|
||||
, m_scratchUnifiedCount( 0 )
|
||||
|
@ -282,6 +281,7 @@ public:
|
|||
, m_scratchSpace( 0 )
|
||||
, m_scratchFlags( 0 )
|
||||
, m_scratchUnified( 0 )
|
||||
, m_scratchConcurrentBitset( 0 )
|
||||
, m_stream( 0 )
|
||||
{}
|
||||
|
||||
|
@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
|
|||
if ( m_stream ||
|
||||
m_scratchSpace ||
|
||||
m_scratchFlags ||
|
||||
m_scratchUnified ) {
|
||||
m_scratchUnified ||
|
||||
m_scratchConcurrentBitset ) {
|
||||
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
|
||||
<< std::endl ;
|
||||
std::cerr.flush();
|
||||
|
@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
|
|||
m_maxWarpCount = 0 ;
|
||||
m_maxBlock = 0 ;
|
||||
m_maxSharedWords = 0 ;
|
||||
m_maxConcurrency = 0 ;
|
||||
m_scratchSpaceCount = 0 ;
|
||||
m_scratchFlagsCount = 0 ;
|
||||
m_scratchUnifiedCount = 0 ;
|
||||
|
@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
|
|||
m_scratchSpace = 0 ;
|
||||
m_scratchFlags = 0 ;
|
||||
m_scratchUnified = 0 ;
|
||||
m_scratchConcurrentBitset = 0 ;
|
||||
m_stream = 0 ;
|
||||
}
|
||||
|
||||
|
@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
|||
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
|
||||
}
|
||||
//----------------------------------
|
||||
// Concurrent bitset for obtaining unique tokens from within
|
||||
// an executing kernel.
|
||||
{
|
||||
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
|
||||
|
||||
m_maxConcurrency =
|
||||
max_threads_per_sm * cudaProp.multiProcessorCount ;
|
||||
|
||||
const int32_t buffer_bound =
|
||||
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
|
||||
|
||||
// Allocate and initialize uint32_t[ buffer_bound ]
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
, "InternalScratchBitset"
|
||||
, sizeof(uint32_t) * buffer_bound );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() );
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
|
||||
|
||||
}
|
||||
//----------------------------------
|
||||
|
||||
if ( stream_count ) {
|
||||
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
|
||||
|
@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
|||
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
|
||||
|
||||
// Init the array used for arbitrarily sized atomics
|
||||
Impl::init_lock_arrays_cuda_space();
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
Kokkos::Impl::CudaLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
Impl::initialize_host_cuda_lock_arrays();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -635,9 +656,7 @@ void CudaInternal::finalize()
|
|||
was_finalized = 1;
|
||||
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
|
||||
|
||||
atomic_lock_array_cuda_space_ptr(true);
|
||||
scratch_lock_array_cuda_space_ptr(true);
|
||||
threadid_lock_array_cuda_space_ptr(true);
|
||||
Impl::finalize_host_cuda_lock_arrays();
|
||||
|
||||
if ( m_stream ) {
|
||||
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
|
||||
|
@ -653,6 +672,7 @@ void CudaInternal::finalize()
|
|||
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
|
||||
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
|
||||
|
||||
m_cudaDev = -1 ;
|
||||
m_multiProcCount = 0 ;
|
||||
|
@ -666,6 +686,7 @@ void CudaInternal::finalize()
|
|||
m_scratchSpace = 0 ;
|
||||
m_scratchFlags = 0 ;
|
||||
m_scratchUnified = 0 ;
|
||||
m_scratchConcurrentBitset = 0 ;
|
||||
m_stream = 0 ;
|
||||
}
|
||||
}
|
||||
|
@ -713,9 +734,8 @@ namespace Kokkos {
|
|||
Cuda::size_type Cuda::detect_device_count()
|
||||
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
|
||||
|
||||
int Cuda::concurrency() {
|
||||
return 131072;
|
||||
}
|
||||
int Cuda::concurrency()
|
||||
{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
|
||||
|
||||
int Cuda::is_initialized()
|
||||
{ return Impl::CudaInternal::singleton().is_initialized(); }
|
||||
|
@ -798,7 +818,22 @@ void Cuda::fence()
|
|||
const char* Cuda::name() { return "Cuda"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
// Constructor of the Global-scope UniqueToken specialization for Kokkos::Cuda.
// Both members are taken from the CudaInternal singleton:
//   m_buffer <- m_scratchConcurrentBitset
//   m_count  <- m_maxConcurrency
// The Kokkos::Cuda argument is unnamed and not used here.
UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
|
||||
UniqueToken( Kokkos::Cuda const & )
|
||||
: m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
|
||||
, m_count( Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
|
||||
{}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
|
||||
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
__device__ __constant__
|
||||
CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace {
|
||||
|
||||
// Kernel: each thread writes 0 into one entry of
// g_device_cuda_lock_arrays.atomic. 0 is the value that
// lock_address_cuda_space's atomicCAS(...,0,1) treats as "not locked".
__global__ void init_lock_array_kernel_atomic() {
|
||||
// Global thread index across the 1-D launch.
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
// The array holds CUDA_SPACE_ATOMIC_MASK+1 entries; threads past the end do nothing.
if(i<CUDA_SPACE_ATOMIC_MASK+1) {
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Kernel: each thread writes 0 into one entry of
// g_device_cuda_lock_arrays.scratch.
// N is the number of entries to clear; threads with index >= N do nothing.
__global__ void init_lock_array_kernel_threadid(int N) {
|
||||
// Global thread index across the 1-D launch.
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
// N is cast to unsigned so the comparison with i is unsigned-vs-unsigned.
if(i<(unsigned)N) {
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace Impl {
|
||||
|
||||
CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
|
||||
|
||||
// Allocates the two lock arrays, records them in g_host_cuda_lock_arrays,
// copies that struct to the device-side symbol and launches the two
// init kernels to zero the arrays.
// NOTE(review): allocation sizes use sizeof(int) while the struct fields are
// std::int32_t*; these agree only where int is 32 bits wide.
void initialize_host_cuda_lock_arrays() {
|
||||
// A non-null atomic pointer means a previous call already did the work.
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
|
||||
// One entry per possible hash value (0 .. CUDA_SPACE_ATOMIC_MASK).
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
|
||||
sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
|
||||
// Cuda::concurrency() entries for the scratch array.
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
|
||||
sizeof(int)*(Cuda::concurrency())));
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
g_host_cuda_lock_arrays.n = Cuda::concurrency();
|
||||
// Make the device-side struct point at the freshly allocated arrays
// before the kernels below dereference it.
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
||||
// Grid sizes are rounded up to whole 256-thread blocks; the kernels
// bounds-check the surplus threads.
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
|
||||
init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
// Frees both lock arrays and resets g_host_cuda_lock_arrays to
// { nullptr, nullptr, 0 }.
// NOTE(review): the cudaFree return values are not checked here, unlike the
// CUDA_SAFE_CALL-wrapped calls in initialize_host_cuda_lock_arrays.
void finalize_host_cuda_lock_arrays() {
|
||||
// A null atomic pointer means there is nothing allocated to release.
if (g_host_cuda_lock_arrays.atomic == nullptr) return;
|
||||
cudaFree(g_host_cuda_lock_arrays.atomic);
|
||||
g_host_cuda_lock_arrays.atomic = nullptr;
|
||||
cudaFree(g_host_cuda_lock_arrays.scratch);
|
||||
g_host_cuda_lock_arrays.scratch = nullptr;
|
||||
g_host_cuda_lock_arrays.n = 0;
|
||||
// With relocatable device code, the cleared struct is also copied to the
// device-side symbol.
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
|
||||
void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,166 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_LOCKS_HPP
|
||||
#define KOKKOS_CUDA_LOCKS_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Pointers to the lock arrays plus the scratch array length.
///
/// One instance lives on the host (g_host_cuda_lock_arrays) and is copied
/// to the device-side g_device_cuda_lock_arrays by
/// KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE().
struct CudaLockArrays {
|
||||
// Lock table indexed by the address hash in lock_address_cuda_space /
// unlock_address_cuda_space; allocated with CUDA_SPACE_ATOMIC_MASK+1
// entries by initialize_host_cuda_lock_arrays.
std::int32_t* atomic;
|
||||
// Allocated with Cuda::concurrency() entries by
// initialize_host_cuda_lock_arrays.
std::int32_t* scratch;
|
||||
// Set to Cuda::concurrency() on initialization and to 0 on finalization.
std::int32_t n;
|
||||
};
|
||||
|
||||
/// \brief This global variable in Host space is the central definition
|
||||
/// of these arrays.
|
||||
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
|
||||
|
||||
/// \brief After this call, the g_host_cuda_lock_arrays variable has
|
||||
/// valid, initialized arrays.
|
||||
///
|
||||
/// This call is idempotent.
|
||||
void initialize_host_cuda_lock_arrays();
|
||||
|
||||
/// \brief After this call, the g_host_cuda_lock_arrays variable has
|
||||
/// all null pointers, and all array memory has been freed.
|
||||
///
|
||||
/// This call is idempotent.
|
||||
void finalize_host_cuda_lock_arrays();
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#if defined( __CUDACC__ )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief This global variable in CUDA space is what kernels use
|
||||
/// to get access to the lock arrays.
|
||||
///
|
||||
/// When relocatable device code is enabled, there can be one single
|
||||
/// instance of this global variable for the entire executable,
|
||||
/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
|
||||
/// here must then be extern).
|
||||
/// This one instance will be initialized by initialize_host_cuda_lock_arrays
|
||||
/// and need not be modified afterwards.
|
||||
///
|
||||
/// When relocatable device code is disabled, an instance of this variable
|
||||
/// will be created in every translation unit that sees this header file
|
||||
/// (we make this clear by marking it static, meaning no other translation
|
||||
/// unit can link to it).
|
||||
/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
|
||||
/// instances in other translation units, we must update this CUDA global
|
||||
/// variable based on the Host global variable prior to running any kernels
|
||||
/// that will use it.
|
||||
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
|
||||
__device__ __constant__
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
|
||||
|
||||
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
|
||||
/// \brief Acquire a lock for the address
|
||||
///
|
||||
/// This function tries to acquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully aquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
__device__ inline
|
||||
bool lock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
|
||||
}
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
/// This function releases the lock for the hash value derived
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully aquiring a lock with
|
||||
/// lock_address.
|
||||
__device__ inline
|
||||
void unlock_address_cuda_space(void* ptr) {
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & CUDA_SPACE_ATOMIC_MASK;
|
||||
atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/* Dan Ibanez: it is critical that this code be a macro, so that it will
|
||||
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
|
||||
putting this in an inline function will NOT do the right thing! */
|
||||
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
|
||||
{ \
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays , \
|
||||
& Kokkos::Impl::g_host_cuda_lock_arrays , \
|
||||
sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
|
||||
#else
|
||||
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
|
||||
#endif
|
||||
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
|
||||
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
|
||||
|
||||
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
|
|
@ -58,6 +58,7 @@
|
|||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
|
@ -65,6 +66,8 @@
|
|||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
@ -318,6 +321,7 @@ private:
|
|||
typedef Kokkos::RangePolicy< Traits ... > Policy;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
@ -363,7 +367,7 @@ public:
|
|||
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
|
||||
const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
|
||||
|
||||
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
|
@ -373,6 +377,115 @@ public:
|
|||
{ }
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, Kokkos::Cuda
|
||||
>
|
||||
{
|
||||
private:
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
|
||||
using RP = Policy;
|
||||
typedef typename Policy::array_index_type array_index_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_rp ;
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
__device__
|
||||
void operator()(void) const
|
||||
{
|
||||
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
|
||||
if ( RP::rank == 2 )
|
||||
{
|
||||
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
|
||||
const dim3 grid(
|
||||
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
|
||||
, 1
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 3 )
|
||||
{
|
||||
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
|
||||
const dim3 grid(
|
||||
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 4 )
|
||||
{
|
||||
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
|
||||
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
|
||||
const dim3 grid(
|
||||
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
|
||||
, std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 5 )
|
||||
{
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
|
||||
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
|
||||
const dim3 grid(
|
||||
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else if ( RP::rank == 6 )
|
||||
{
|
||||
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
|
||||
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
|
||||
const dim3 grid(
|
||||
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
, std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
|
||||
, static_cast<index_type>(maxblocks) )
|
||||
);
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
|
||||
Kokkos::abort("Aborting");
|
||||
}
|
||||
|
||||
} //end execute
|
||||
|
||||
// inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, Policy arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_rp( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
|
@ -384,6 +497,7 @@ private:
|
|||
typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -430,15 +544,15 @@ public:
|
|||
if ( m_scratch_size[1]>0 ) {
|
||||
__shared__ int base_thread_id;
|
||||
if (threadIdx.x==0 && threadIdx.y==0 ) {
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
|
||||
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
int done = 0;
|
||||
while (!done) {
|
||||
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
|
||||
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
|
||||
if(!done) {
|
||||
threadid += blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
|
||||
}
|
||||
}
|
||||
base_thread_id = threadid;
|
||||
|
@ -448,7 +562,8 @@ public:
|
|||
}
|
||||
|
||||
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
const int int_league_size = (int)m_league_size;
|
||||
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
|
||||
|
||||
this-> template exec_team< WorkTag >(
|
||||
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
|
||||
|
@ -462,7 +577,7 @@ public:
|
|||
if ( m_scratch_size[1]>0 ) {
|
||||
__syncthreads();
|
||||
if (threadIdx.x==0 && threadIdx.y==0 )
|
||||
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -473,7 +588,7 @@ public:
|
|||
const dim3 grid( int(m_league_size) , 1 , 1 );
|
||||
const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
|
||||
|
||||
CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
|
||||
}
|
||||
|
||||
|
@ -529,6 +644,7 @@ private:
|
|||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
@ -563,6 +679,7 @@ private:
|
|||
typedef int DummySHMEMReductionType;
|
||||
|
||||
public:
|
||||
// Make the exec_range calls call to Reduce::DeviceIterateTile
|
||||
template< class TagType >
|
||||
__device__ inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
|
@ -686,7 +803,7 @@ public:
|
|||
|
||||
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
|
||||
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
|
@ -737,6 +854,232 @@ public:
|
|||
{ }
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Cuda
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
|
||||
typedef typename Policy::array_index_type array_index_type;
|
||||
typedef typename Policy::index_type index_type;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
public:
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef FunctorType functor_type ;
|
||||
typedef Cuda::size_type size_type ;
|
||||
|
||||
// Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ; // used for workrange and nwork
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
size_type * m_scratch_space ;
|
||||
size_type * m_scratch_flags ;
|
||||
size_type * m_unified_space ;
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
|
||||
|
||||
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
|
||||
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
|
||||
// Some crutch to do function overloading
|
||||
private:
|
||||
typedef double DummyShflReductionType;
|
||||
typedef int DummySHMEMReductionType;
|
||||
|
||||
public:
|
||||
inline
|
||||
__device__
|
||||
void
|
||||
exec_range( reference_type update ) const
|
||||
{
|
||||
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
|
||||
}
|
||||
|
||||
inline
|
||||
__device__
|
||||
void operator() (void) const {
|
||||
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummySHMEMReductionType& ) const
|
||||
{
|
||||
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
|
||||
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
|
||||
|
||||
{
|
||||
reference_type value =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
|
||||
|
||||
// Number of blocks is bounded so that the reduction can be limited to two passes.
|
||||
// Each thread block is given an approximately equal amount of work to perform.
|
||||
// Accumulate the values for this block.
|
||||
// The accumulation ordering does not match the final pass, but is arithmatically equivalent.
|
||||
|
||||
this-> exec_range( value );
|
||||
}
|
||||
|
||||
// Reduce with final value at blockDim.y - 1 location.
|
||||
// Problem: non power-of-two blockDim
|
||||
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
|
||||
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
|
||||
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
|
||||
|
||||
// This is the final block with the final result at the final threads' location
|
||||
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
|
||||
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
|
||||
|
||||
if ( threadIdx.y == 0 ) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
|
||||
}
|
||||
|
||||
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
|
||||
|
||||
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummyShflReductionType&) const
|
||||
{
|
||||
|
||||
value_type value;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
|
||||
// Number of blocks is bounded so that the reduction can be limited to two passes.
|
||||
// Each thread block is given an approximately equal amount of work to perform.
|
||||
// Accumulate the values for this block.
|
||||
// The accumulation ordering does not match the final pass, but is arithmatically equivalent.
|
||||
|
||||
const Member work_part =
|
||||
( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
|
||||
|
||||
this-> exec_range( value );
|
||||
|
||||
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
|
||||
|
||||
int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
|
||||
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
|
||||
|
||||
value_type init;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
|
||||
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
|
||||
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
|
||||
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
if(id==0) {
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
|
||||
*result = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine block size constrained by shared memory:
|
||||
static inline
|
||||
unsigned local_block_size( const FunctorType & f )
|
||||
{
|
||||
unsigned n = CudaTraits::WarpSize * 8 ;
|
||||
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
|
||||
return n ;
|
||||
}
|
||||
|
||||
inline
|
||||
void execute()
|
||||
{
|
||||
const int nwork = m_policy.m_num_tiles;
|
||||
if ( nwork ) {
|
||||
int block_size = m_policy.m_prod_tile_dims;
|
||||
// CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
|
||||
// Nearest power of two
|
||||
int exponent_pow_two = std::ceil( std::log2(block_size) );
|
||||
block_size = std::pow(2, exponent_pow_two);
|
||||
int suggested_blocksize = local_block_size( m_functor );
|
||||
|
||||
block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
|
||||
|
||||
|
||||
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
|
||||
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
|
||||
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
|
||||
|
||||
// REQUIRED ( 1 , N , 1 )
|
||||
const dim3 block( 1 , block_size , 1 );
|
||||
// Required grid.x <= block.y
|
||||
const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
|
||||
|
||||
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
|
||||
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
if ( m_unified_space ) {
|
||||
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
|
||||
}
|
||||
else {
|
||||
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (m_result_ptr) {
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
{}
|
||||
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ReducerType & reducer)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().ptr_on_device() )
|
||||
, m_scratch_space( 0 )
|
||||
, m_scratch_flags( 0 )
|
||||
, m_unified_space( 0 )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if 1
|
||||
|
@ -753,6 +1096,7 @@ private:
|
|||
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
@ -819,15 +1163,15 @@ public:
|
|||
if ( m_scratch_size[1]>0 ) {
|
||||
__shared__ int base_thread_id;
|
||||
if (threadIdx.x==0 && threadIdx.y==0 ) {
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
|
||||
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
int done = 0;
|
||||
while (!done) {
|
||||
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
|
||||
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
|
||||
if(!done) {
|
||||
threadid += blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
|
||||
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
|
||||
}
|
||||
}
|
||||
base_thread_id = threadid;
|
||||
|
@ -840,7 +1184,7 @@ public:
|
|||
if ( m_scratch_size[1]>0 ) {
|
||||
__syncthreads();
|
||||
if (threadIdx.x==0 && threadIdx.y==0 )
|
||||
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
|
||||
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -854,7 +1198,8 @@ public:
|
|||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
|
||||
|
||||
// Iterate this block through the league
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
const int int_league_size = (int)m_league_size;
|
||||
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
|
||||
this-> template exec_team< WorkTag >
|
||||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
|
@ -894,7 +1239,8 @@ public:
|
|||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
|
||||
|
||||
// Iterate this block through the league
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
const int int_league_size = (int)m_league_size;
|
||||
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
|
||||
this-> template exec_team< WorkTag >
|
||||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
|
@ -936,7 +1282,7 @@ public:
|
|||
const dim3 grid( block_count , 1 , 1 );
|
||||
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
|
||||
|
||||
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
|
@ -975,12 +1321,6 @@ public:
|
|||
, m_shmem_begin( 0 )
|
||||
, m_shmem_size( 0 )
|
||||
, m_scratch_ptr{NULL,NULL}
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
, m_scratch_size{
|
||||
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
|
@ -991,6 +1331,12 @@ public:
|
|||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
)}
|
||||
, m_league_size( arg_policy.league_size() )
|
||||
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
|
||||
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
|
||||
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
|
||||
arg_policy.vector_length() )
|
||||
, m_vector_size( arg_policy.vector_length() )
|
||||
{
|
||||
// Return Init value if the number of worksets is zero
|
||||
if( arg_policy.league_size() == 0) {
|
||||
|
@ -1150,6 +1496,7 @@ private:
|
|||
typedef typename reducer_type<>::pointer_type pointer_type ;
|
||||
typedef typename reducer_type<>::reference_type reference_type ;
|
||||
typedef typename reducer_type<>::value_type value_type ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorAnalysis
|
||||
< Kokkos::Impl::FunctorPatternInterface::REDUCE
|
||||
|
@ -1273,7 +1620,7 @@ public:
|
|||
const int shmem = m_shmem_team_begin + m_shmem_team_size ;
|
||||
|
||||
// copy to device and execute
|
||||
CudaParallelLaunch<ParallelReduce>( *this, grid, block, shmem );
|
||||
CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
|
||||
|
||||
Cuda::fence();
|
||||
|
||||
|
@ -1373,7 +1720,7 @@ public:
|
|||
|
||||
if ( CudaTraits::WarpSize < team_threads ) {
|
||||
// Need inter-warp team reduction (collectives) shared memory
|
||||
// Speculate an upper bound for the value size
|
||||
// Speculate an upper bound for the value size
|
||||
|
||||
m_shmem_team_begin =
|
||||
align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
|
||||
|
@ -1426,7 +1773,7 @@ public:
|
|||
|
||||
// Reduce space has claim flag followed by value buffer
|
||||
const int global_reduce_value_size =
|
||||
max_concurrent_block *
|
||||
max_concurrent_block *
|
||||
( aligned_flag_size + align_scratch( value_size ) );
|
||||
|
||||
// Scratch space has claim flag followed by scratch buffer
|
||||
|
@ -1469,6 +1816,7 @@ private:
|
|||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::launch_bounds LaunchBounds ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
@ -1655,10 +2003,10 @@ public:
|
|||
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
|
||||
|
||||
m_final = false ;
|
||||
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
|
||||
m_final = true ;
|
||||
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
|
||||
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
|
|||
__device__
|
||||
inline void cuda_intra_warp_reduction( ValueType& result,
|
||||
const JoinOp& join,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
const uint32_t max_active_thread = blockDim.y) {
|
||||
|
||||
unsigned int shift = 1;
|
||||
|
||||
|
@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
|||
if( id + 1 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
int active = __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
}
|
||||
}
|
||||
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
return last_block;
|
||||
#else
|
||||
|
@ -302,7 +306,7 @@ template< class ReducerType >
|
|||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_intra_warp_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
const uint32_t max_active_thread = blockDim.y) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
|
@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
|
|||
if( id + 1 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
int active = __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
active += __ballot(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
|||
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
//typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
// '__ffs' = position of the least significant bit set to 1.
|
||||
// 'blockDim.y' is guaranteed to be a power of two so this
|
||||
|
@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
|||
|
||||
{
|
||||
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
|
||||
reference_type shared_value = ValueInit::init( functor , shared_ptr );
|
||||
/* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
|
||||
|
||||
for ( size_type i = b ; i < e ; ++i ) {
|
||||
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
|
||||
|
|
|
@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
|
|||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_DEBUG )
|
||||
|
||||
/// Debug-only check that the calling warp has not diverged.
///
/// Ballots the warp with a constant-true predicate.  When the resulting
/// mask is not all ones, a diagnostic is printed containing the call-site
/// label, the block index, the thread index, and the observed mask in hex.
///
/// \param where  label identifying the call site; echoed in the message.
__device__
void verify_warp_convergence( const char * const where )
{
  // One bit per lane that reached this point with the predicate true.
  const unsigned lane_mask = __ballot(1);
  const unsigned full_mask = ~0u ;

  // Nothing to report when every bit of the mask is set.
  if ( lane_mask == full_mask ) return ;

  printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
        , where
        , blockIdx.x , blockIdx.y , blockIdx.z
        , threadIdx.x , threadIdx.y , threadIdx.z
        , lane_mask );
}
|
||||
|
||||
#endif // #if defined( KOKKOS_DEBUG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
__device__
|
||||
void TaskQueueSpecialization< Kokkos::Cuda >::driver
|
||||
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
|
||||
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
|
||||
, int32_t shmem_per_warp )
|
||||
{
|
||||
using Member = TaskExec< Kokkos::Cuda > ;
|
||||
using Queue = TaskQueue< Kokkos::Cuda > ;
|
||||
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
|
||||
using task_root_type = TaskBase< void , void , void > ;
|
||||
|
||||
extern __shared__ int32_t shmem_all[];
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec( 1 );
|
||||
Member team_exec( blockDim.y );
|
||||
int32_t * const warp_shmem =
|
||||
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
|
||||
|
||||
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
|
||||
|
||||
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
|
||||
|
||||
union {
|
||||
task_root_type * ptr ;
|
||||
int raw[2] ;
|
||||
} task ;
|
||||
Member single_exec( warp_shmem , 1 );
|
||||
Member team_exec( warp_shmem , blockDim.y );
|
||||
|
||||
task_root_type * task_ptr ;
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
|
@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
|
|||
|
||||
if ( 0 == warp_lane ) {
|
||||
|
||||
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
|
||||
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
|
||||
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
, uintptr_t(task.ptr));
|
||||
, uintptr_t(task_ptr));
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// shuffle broadcast
|
||||
|
||||
task.raw[0] = __shfl( task.raw[0] , 0 );
|
||||
task.raw[1] = __shfl( task.raw[1] , 0 );
|
||||
((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
|
||||
((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
|
||||
|
||||
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
|
||||
#if defined( KOKKOS_DEBUG )
|
||||
verify_warp_convergence("task_ptr");
|
||||
#endif
|
||||
|
||||
if ( end != task.ptr ) {
|
||||
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
|
||||
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
|
||||
|
||||
if ( end != task_ptr ) {
|
||||
|
||||
// Whole warp copy task's closure to/from shared memory.
|
||||
// Use all threads of warp for coalesced read/write.
|
||||
|
||||
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
|
||||
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
|
||||
|
||||
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
|
||||
|
||||
// copy global to shared memory:
|
||||
|
||||
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
|
||||
warp_shmem[i] = task_mem[i] ;
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// Copy done - use memory fence so that memory writes are visible.
|
||||
// For reliable warp convergence on Pascal and Volta an explicit
|
||||
// warp level synchronization will also be required.
|
||||
|
||||
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
|
||||
// Thread Team Task
|
||||
(*task.ptr->m_apply)( task.ptr , & team_exec );
|
||||
(*task_shmem->m_apply)( task_shmem , & team_exec );
|
||||
}
|
||||
else if ( 0 == threadIdx.y ) {
|
||||
// Single Thread Task
|
||||
(*task.ptr->m_apply)( task.ptr , & single_exec );
|
||||
(*task_shmem->m_apply)( task_shmem , & single_exec );
|
||||
}
|
||||
|
||||
// copy shared to global memory:
|
||||
|
||||
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
|
||||
task_mem[i] = warp_shmem[i] ;
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
#if defined( KOKKOS_DEBUG )
|
||||
verify_warp_convergence("apply");
|
||||
#endif
|
||||
|
||||
// If respawn requested copy respawn data back to main memory
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
queue->complete( task.ptr );
|
||||
|
||||
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
|
||||
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
|
||||
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
|
||||
}
|
||||
|
||||
queue->complete( task_ptr );
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
|
@ -130,18 +206,20 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
|||
namespace {
|
||||
|
||||
__global__
|
||||
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
|
||||
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
|
||||
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
|
||||
, int32_t shmem_size )
|
||||
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::Cuda >::execute
|
||||
( TaskQueue< Kokkos::Cuda > * const queue )
|
||||
{
|
||||
const int shared_per_warp = 2048 ;
|
||||
const int warps_per_block = 4 ;
|
||||
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
|
||||
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
|
||||
const int shared = 0 ;
|
||||
const int shared_total = shared_per_warp * warps_per_block ;
|
||||
const cudaStream_t stream = 0 ;
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
|
|||
//
|
||||
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
|
||||
|
||||
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
|
||||
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ namespace {
|
|||
template< typename TaskType >
|
||||
__global__
|
||||
void set_cuda_task_base_apply_function_pointer
|
||||
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
|
||||
( TaskBase<void,void,void>::function_type * ptr )
|
||||
{ *ptr = TaskType::apply ; }
|
||||
|
||||
}
|
||||
|
@ -78,7 +78,7 @@ public:
|
|||
void iff_single_thread_recursive_execute( queue_type * const ) {}
|
||||
|
||||
__device__
|
||||
static void driver( queue_type * const );
|
||||
static void driver( queue_type * const , int32_t );
|
||||
|
||||
static
|
||||
void execute( queue_type * const );
|
||||
|
@ -106,7 +106,14 @@ public:
|
|||
|
||||
extern template class TaskQueue< Kokkos::Cuda > ;
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
|
||||
* passed to tasks running in a Cuda space.
|
||||
*
|
||||
|
@ -134,11 +141,13 @@ private:
|
|||
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
|
||||
|
||||
int32_t * m_team_shmem ;
|
||||
const int m_team_size ;
|
||||
|
||||
__device__
|
||||
TaskExec( int arg_team_size = blockDim.y )
|
||||
: m_team_size( arg_team_size ) {}
|
||||
TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
|
||||
: m_team_shmem( arg_team_shmem )
|
||||
, m_team_size( arg_team_size ) {}
|
||||
|
||||
public:
|
||||
|
||||
|
@ -154,7 +163,13 @@ public:
|
|||
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
|
||||
|
|
|
@ -106,7 +106,7 @@ private:
|
|||
typedef Kokkos::Cuda execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
void * m_team_reduce ;
|
||||
mutable void * m_team_reduce ;
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_reduce_size ;
|
||||
int m_league_rank ;
|
||||
|
@ -166,7 +166,7 @@ public:
|
|||
if ( 1 == blockDim.z ) { // team == block
|
||||
__syncthreads();
|
||||
// Wait for shared data write until all threads arrive here
|
||||
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
|
||||
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
|
||||
*((ValueType*) m_team_reduce) = val ;
|
||||
}
|
||||
__syncthreads(); // Wait for shared data read until root thread writes
|
||||
|
@ -210,7 +210,7 @@ public:
|
|||
const int wx =
|
||||
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
|
||||
|
||||
|
@ -354,7 +354,7 @@ public:
|
|||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
|
||||
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
|
||||
if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
|
||||
}
|
||||
|
||||
// Broadcast from root lane to all other lanes.
|
||||
|
@ -410,7 +410,7 @@ public:
|
|||
|
||||
value_type tmp( reducer.reference() );
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
|
||||
|
||||
|
@ -479,7 +479,7 @@ public:
|
|||
|
||||
__threadfence(); // Wait until global write is visible.
|
||||
|
||||
last_block = gridDim.x ==
|
||||
last_block = (int)gridDim.x ==
|
||||
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
|
||||
|
||||
// If last block then reset count
|
||||
|
@ -509,7 +509,7 @@ public:
|
|||
reducer.copy( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space) + offset );
|
||||
|
||||
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
|
||||
for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
|
||||
reducer.join( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space)
|
||||
+ i * reducer.length() );
|
||||
|
@ -576,6 +576,14 @@ public:
|
|||
, m_league_size( arg_league_size )
|
||||
{}
|
||||
|
||||
public:
|
||||
// Declare to avoid unused private member warnings which are trigger
|
||||
// when SFINAE excludes the member function which uses these variables
|
||||
// Making another class a friend also surpresses these warnings
|
||||
bool impl_avoid_sfinae_warning() const noexcept
|
||||
{
|
||||
return m_team_reduce_size > 0 && m_team_reduce != nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
@ -913,10 +921,10 @@ void parallel_scan
|
|||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
|
||||
for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
|
||||
value_type tmp = 0 ;
|
||||
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
|
||||
if ( j <= threadIdx.x ) { sval += tmp ; }
|
||||
if ( j <= (int)threadIdx.x ) { sval += tmp ; }
|
||||
}
|
||||
|
||||
// Include accumulation and remove value for exclusive scan:
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
|
||||
#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_CudaSpace.hpp>
|
||||
#include <Kokkos_UniqueToken.hpp>
|
||||
#include <impl/Kokkos_SharedAlloc.hpp>
|
||||
#include <impl/Kokkos_ConcurrentBitset.hpp>
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
// both global and instance Unique Tokens are implemented in the same way
|
||||
template<>
|
||||
class UniqueToken< Cuda, UniqueTokenScope::Global >
|
||||
{
|
||||
private:
|
||||
|
||||
uint32_t volatile * m_buffer ;
|
||||
uint32_t m_count ;
|
||||
|
||||
public:
|
||||
|
||||
using execution_space = Cuda;
|
||||
|
||||
explicit
|
||||
UniqueToken( execution_space const& );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken() : m_buffer(0), m_count(0) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken( const UniqueToken & ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken( UniqueToken && ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken & operator=( const UniqueToken & ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
UniqueToken & operator=( UniqueToken && ) = default ;
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int32_t size() const noexcept { return m_count ; }
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int32_t acquire() const
|
||||
{
|
||||
const Kokkos::pair<int,int> result =
|
||||
Kokkos::Impl::concurrent_bitset::
|
||||
acquire_bounded( m_buffer
|
||||
, m_count
|
||||
, Kokkos::Impl::clock_tic() % m_count
|
||||
);
|
||||
|
||||
if ( result.first < 0 ) {
|
||||
Kokkos::abort("UniqueToken<Cuda> failure to release tokens, no tokens available" );
|
||||
}
|
||||
|
||||
return result.first;
|
||||
}
|
||||
|
||||
/// \brief release an acquired value
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void release( int32_t i ) const noexcept
|
||||
{
|
||||
Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
class UniqueToken< Cuda, UniqueTokenScope::Instance >
|
||||
: public UniqueToken< Cuda, UniqueTokenScope::Global >
|
||||
{
|
||||
public:
|
||||
|
||||
explicit
|
||||
UniqueToken( execution_space const& arg )
|
||||
: UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
|
||||
|
|
@ -221,7 +221,6 @@ struct CudaLDGFetch {
|
|||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
|
||||
|
@ -294,9 +293,8 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
|
||||
#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType ,
|
||||
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
|
||||
Kokkos::Cuda
|
||||
>
|
||||
: public Kokkos::Impl::Experimental::
|
||||
WorkGraphExec< FunctorType,
|
||||
Kokkos::Cuda,
|
||||
Traits ...
|
||||
>
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
|
||||
typedef Kokkos::Impl::Experimental::
|
||||
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
|
||||
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
|
||||
|
||||
private:
|
||||
|
||||
template< class TagType >
|
||||
__device__
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_one(const typename Policy::member_type& i) const {
|
||||
Base::m_functor( i );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
__device__
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_one(const typename Policy::member_type& i) const {
|
||||
const TagType t{} ;
|
||||
Base::m_functor( t , i );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
__device__
|
||||
inline
|
||||
void operator()() const {
|
||||
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
|
||||
exec_one< typename Policy::work_tag >( i );
|
||||
Base::after_work(i);
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void execute()
|
||||
{
|
||||
const int warps_per_block = 4 ;
|
||||
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
|
||||
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
|
||||
const int shared = 0 ;
|
||||
const cudaStream_t stream = 0 ;
|
||||
|
||||
Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: Base( arg_functor, arg_policy )
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */
|
|
@ -52,6 +52,7 @@
|
|||
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
|
||||
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
|
||||
#endif
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
@ -120,28 +121,17 @@ struct MDRangePolicy
|
|||
, typename traits::index_type
|
||||
> ;
|
||||
|
||||
typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
|
||||
|
||||
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
|
||||
, "Kokkos Error: MD iteration pattern not defined" );
|
||||
|
||||
using iteration_pattern = typename traits::iteration_pattern;
|
||||
using work_tag = typename traits::work_tag;
|
||||
using launch_bounds = typename traits::launch_bounds;
|
||||
using member_type = typename range_policy::member_type;
|
||||
|
||||
static constexpr int rank = iteration_pattern::rank;
|
||||
|
||||
static constexpr int outer_direction = static_cast<int> (
|
||||
(iteration_pattern::outer_direction != Iterate::Default)
|
||||
? iteration_pattern::outer_direction
|
||||
: default_outer_direction< typename traits::execution_space>::value );
|
||||
|
||||
static constexpr int inner_direction = static_cast<int> (
|
||||
iteration_pattern::inner_direction != Iterate::Default
|
||||
? iteration_pattern::inner_direction
|
||||
: default_inner_direction< typename traits::execution_space>::value ) ;
|
||||
|
||||
|
||||
// Ugly ugly workaround intel 14 not handling scoped enum correctly
|
||||
static constexpr int Right = static_cast<int>( Iterate::Right );
|
||||
static constexpr int Left = static_cast<int>( Iterate::Left );
|
||||
enum { rank = static_cast<int>(iteration_pattern::rank) };
|
||||
|
||||
using index_type = typename traits::index_type;
|
||||
using array_index_type = long;
|
||||
|
@ -155,11 +145,50 @@ struct MDRangePolicy
|
|||
// This would require the user to either pass a matching index_type parameter
|
||||
// as template parameter to the MDRangePolicy or static_cast the individual values
|
||||
|
||||
point_type m_lower;
|
||||
point_type m_upper;
|
||||
tile_type m_tile;
|
||||
point_type m_tile_end;
|
||||
index_type m_num_tiles;
|
||||
index_type m_prod_tile_dims;
|
||||
|
||||
/*
|
||||
// NDE enum impl definition alternative - replace static constexpr int ?
|
||||
enum { outer_direction = static_cast<int> (
|
||||
(iteration_pattern::outer_direction != Iterate::Default)
|
||||
? iteration_pattern::outer_direction
|
||||
: default_outer_direction< typename traits::execution_space>::value ) };
|
||||
|
||||
enum { inner_direction = static_cast<int> (
|
||||
iteration_pattern::inner_direction != Iterate::Default
|
||||
? iteration_pattern::inner_direction
|
||||
: default_inner_direction< typename traits::execution_space>::value ) };
|
||||
|
||||
enum { Right = static_cast<int>( Iterate::Right ) };
|
||||
enum { Left = static_cast<int>( Iterate::Left ) };
|
||||
*/
|
||||
//static constexpr int rank = iteration_pattern::rank;
|
||||
|
||||
static constexpr int outer_direction = static_cast<int> (
|
||||
(iteration_pattern::outer_direction != Iterate::Default)
|
||||
? iteration_pattern::outer_direction
|
||||
: default_outer_direction< typename traits::execution_space>::value );
|
||||
|
||||
static constexpr int inner_direction = static_cast<int> (
|
||||
iteration_pattern::inner_direction != Iterate::Default
|
||||
? iteration_pattern::inner_direction
|
||||
: default_inner_direction< typename traits::execution_space>::value ) ;
|
||||
|
||||
// Ugly ugly workaround intel 14 not handling scoped enum correctly
|
||||
static constexpr int Right = static_cast<int>( Iterate::Right );
|
||||
static constexpr int Left = static_cast<int>( Iterate::Left );
|
||||
|
||||
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
|
||||
: m_lower(lower)
|
||||
, m_upper(upper)
|
||||
, m_tile(tile)
|
||||
, m_num_tiles(1)
|
||||
, m_prod_tile_dims(1)
|
||||
{
|
||||
// Host
|
||||
if ( true
|
||||
|
@ -172,8 +201,8 @@ struct MDRangePolicy
|
|||
for (int i=0; i<rank; ++i) {
|
||||
span = upper[i] - lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|
||||
|| ((int)inner_direction == (int)Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
}
|
||||
|
@ -183,6 +212,7 @@ struct MDRangePolicy
|
|||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
m_prod_tile_dims *= m_tile[i];
|
||||
}
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
|
@ -190,14 +220,18 @@ struct MDRangePolicy
|
|||
{
|
||||
index_type span;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
span = upper[i] - lower[i];
|
||||
span = m_upper[i] - m_lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
// TODO: determine what is a good default tile size for cuda
|
||||
// may be rank dependent
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|
||||
|| ((int)inner_direction == (int)Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
if ( m_prod_tile_dims < 512 ) {
|
||||
m_tile[i] = 2;
|
||||
} else {
|
||||
m_tile[i] = 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
m_tile[i] = 16;
|
||||
|
@ -205,12 +239,9 @@ struct MDRangePolicy
|
|||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
m_prod_tile_dims *= m_tile[i];
|
||||
}
|
||||
index_type total_tile_size_check = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
total_tile_size_check *= m_tile[i];
|
||||
}
|
||||
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
|
||||
if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
|
||||
printf(" Tile dimensions exceed Cuda limits\n");
|
||||
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
|
@ -223,19 +254,7 @@ struct MDRangePolicy
|
|||
template < typename LT , typename UT , typename TT = array_index_type >
|
||||
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
|
||||
{
|
||||
#if 0
|
||||
// This should work, less duplicated code but not yet extensively tested
|
||||
point_type lower_tmp, upper_tmp;
|
||||
tile_type tile_tmp;
|
||||
for ( auto i = 0; i < rank; ++i ) {
|
||||
lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
|
||||
upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
|
||||
tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
|
||||
}
|
||||
|
||||
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
|
||||
|
||||
#else
|
||||
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
|
||||
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
|
||||
|
||||
|
@ -249,7 +268,7 @@ struct MDRangePolicy
|
|||
}
|
||||
|
||||
m_num_tiles = 1;
|
||||
|
||||
m_prod_tile_dims = 1;
|
||||
|
||||
// Host
|
||||
if ( true
|
||||
|
@ -262,8 +281,8 @@ struct MDRangePolicy
|
|||
for (int i=0; i<rank; ++i) {
|
||||
span = m_upper[i] - m_lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|
||||
|| ((int)inner_direction == (int)Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
}
|
||||
|
@ -273,6 +292,7 @@ struct MDRangePolicy
|
|||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
m_prod_tile_dims *= m_tile[i];
|
||||
}
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
|
@ -284,10 +304,14 @@ struct MDRangePolicy
|
|||
if ( m_tile[i] <= 0 ) {
|
||||
// TODO: determine what is a good default tile size for cuda
|
||||
// may be rank dependent
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|
||||
|| ((int)inner_direction == (int)Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
if ( m_prod_tile_dims < 512 ) {
|
||||
m_tile[i] = 2;
|
||||
} else {
|
||||
m_tile[i] = 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
m_tile[i] = 16;
|
||||
|
@ -295,32 +319,22 @@ struct MDRangePolicy
|
|||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
m_prod_tile_dims *= m_tile[i];
|
||||
}
|
||||
index_type total_tile_size_check = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
total_tile_size_check *= m_tile[i];
|
||||
}
|
||||
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
|
||||
if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
|
||||
printf(" Tile dimensions exceed Cuda limits\n");
|
||||
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
point_type m_lower;
|
||||
point_type m_upper;
|
||||
tile_type m_tile;
|
||||
point_type m_tile_end;
|
||||
index_type m_num_tiles;
|
||||
};
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
//md_parallel_for
|
||||
//md_parallel_for - deprecated use parallel_for
|
||||
// ------------------------------------------------------------------ //
|
||||
template <typename MDRange, typename Functor, typename Enable = void>
|
||||
void md_parallel_for( MDRange const& range
|
||||
|
@ -335,7 +349,6 @@ void md_parallel_for( MDRange const& range
|
|||
{
|
||||
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
|
||||
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
|
||||
|
@ -354,7 +367,6 @@ void md_parallel_for( const std::string& str
|
|||
{
|
||||
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
|
||||
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
|
||||
|
@ -395,7 +407,7 @@ void md_parallel_for( MDRange const& range
|
|||
// ------------------------------------------------------------------ //
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
//md_parallel_reduce
|
||||
//md_parallel_reduce - deprecated use parallel_reduce
|
||||
// ------------------------------------------------------------------ //
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( MDRange const& range
|
||||
|
@ -409,9 +421,8 @@ void md_parallel_reduce( MDRange const& range
|
|||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
|
||||
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
|
||||
}
|
||||
|
@ -428,48 +439,14 @@ void md_parallel_reduce( const std::string& str
|
|||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
|
||||
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
|
||||
}
|
||||
|
||||
// Cuda - parallel_reduce not implemented yet
|
||||
/*
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( MDRange const& range
|
||||
, Functor const& f
|
||||
, ValueType & v
|
||||
, const std::string& str = ""
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
|
||||
closure.execute();
|
||||
}
|
||||
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( const std::string& str
|
||||
, MDRange const& range
|
||||
, Functor const& f
|
||||
, ValueType & v
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
|
||||
closure.execute();
|
||||
}
|
||||
*/
|
||||
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
|
|
|
@ -114,40 +114,9 @@
|
|||
#endif /* Not pre-selected atomic implementation */
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Forward decalaration of functions supporting arbitrary sized atomics
|
||||
// This is necessary since Kokkos_Atomic.hpp is internally included very early
|
||||
// through Kokkos_HostSpace.hpp as well as the allocation tracker.
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
/// \brief Aquire a lock for the address
|
||||
///
|
||||
/// This function tries to aquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully aquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#endif
|
||||
__device__ inline
|
||||
bool lock_address_cuda_space(void* ptr);
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
/// This function releases the lock for the hash value derived
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully aquiring a lock with
|
||||
/// lock_address.
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
extern
|
||||
#endif
|
||||
__device__ inline
|
||||
void unlock_address_cuda_space(void* ptr);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
template <typename T>
|
||||
|
|
|
@ -79,6 +79,21 @@ struct IndexType
|
|||
using type = T;
|
||||
};
|
||||
|
||||
/**\brief Specify Launch Bounds for CUDA execution.
|
||||
*
|
||||
* The "best" defaults may be architecture specific.
|
||||
*/
|
||||
template< unsigned int maxT = 1024 /* Max threads per block */
|
||||
, unsigned int minB = 1 /* Min blocks per SM */
|
||||
>
|
||||
struct LaunchBounds
|
||||
{
|
||||
using launch_bounds = LaunchBounds;
|
||||
using type = LaunchBounds<maxT,minB>;
|
||||
static unsigned int constexpr maxTperB {maxT};
|
||||
static unsigned int constexpr minBperSM {minB};
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -119,6 +134,7 @@ using Kokkos::is_array_layout ;
|
|||
KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
|
||||
KOKKOS_IMPL_IS_CONCEPT( schedule_type )
|
||||
KOKKOS_IMPL_IS_CONCEPT( index_type )
|
||||
KOKKOS_IMPL_IS_CONCEPT( launch_bounds )
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -96,11 +96,13 @@ struct InitArguments {
|
|||
int num_numa;
|
||||
int device_id;
|
||||
|
||||
InitArguments() {
|
||||
num_threads = -1;
|
||||
num_numa = -1;
|
||||
device_id = -1;
|
||||
}
|
||||
InitArguments( int nt = -1
|
||||
, int nn = -1
|
||||
, int dv = -1)
|
||||
: num_threads( nt )
|
||||
, num_numa( nn )
|
||||
, device_id( dv )
|
||||
{}
|
||||
};
|
||||
|
||||
void initialize(int& narg, char* arg[]);
|
||||
|
@ -168,6 +170,9 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
|||
|
||||
} // namespace Kokkos
|
||||
|
||||
#include <Kokkos_Crs.hpp>
|
||||
#include <Kokkos_WorkGraphPolicy.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -51,6 +51,9 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#include <impl/Kokkos_Utilities.hpp>
|
||||
|
||||
#include <Kokkos_UniqueToken.hpp>
|
||||
#include <Kokkos_MasterLock.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Have assumed a 64bit build (8byte pointers) throughout the code base.
|
||||
|
||||
|
|
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CRS_HPP
|
||||
#define KOKKOS_CRS_HPP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/// \class Crs
|
||||
/// \brief Compressed row storage array.
|
||||
///
|
||||
/// \tparam DataType The type of stored entries. If a Crs is
|
||||
/// used as the graph of a sparse matrix, then this is usually an
|
||||
/// integer type, the type of the column indices in the sparse
|
||||
/// matrix.
|
||||
///
|
||||
/// \tparam Arg1Type The second template parameter, corresponding
|
||||
/// either to the Device type (if there are no more template
|
||||
/// parameters) or to the Layout type (if there is at least one more
|
||||
/// template parameter).
|
||||
///
|
||||
/// \tparam Arg2Type The third template parameter, which if provided
|
||||
/// corresponds to the Device type.
|
||||
///
|
||||
/// \tparam SizeType The type of row offsets. Usually the default
|
||||
/// parameter suffices. However, setting a nondefault value is
|
||||
/// necessary in some cases, for example, if you want to have a
|
||||
/// sparse matrix with dimensions (and therefore column indices)
|
||||
/// that fit in \c int, but want to store more than <tt>INT_MAX</tt>
|
||||
/// entries in the sparse matrix.
|
||||
///
|
||||
/// A row has a range of entries:
|
||||
/// <ul>
|
||||
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
|
||||
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
|
||||
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
|
||||
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
|
||||
/// </ul>
|
||||
template< class DataType,
|
||||
class Arg1Type,
|
||||
class Arg2Type = void,
|
||||
typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
|
||||
class Crs {
|
||||
protected:
|
||||
typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
|
||||
|
||||
public:
|
||||
typedef DataType data_type;
|
||||
typedef typename traits::array_layout array_layout;
|
||||
typedef typename traits::execution_space execution_space;
|
||||
typedef typename traits::memory_space memory_space;
|
||||
typedef typename traits::device_type device_type;
|
||||
typedef SizeType size_type;
|
||||
|
||||
typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
|
||||
typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
|
||||
typedef View<size_type* , array_layout, device_type> row_map_type;
|
||||
typedef View<DataType* , array_layout, device_type> entries_type;
|
||||
|
||||
entries_type entries;
|
||||
row_map_type row_map;
|
||||
|
||||
//! Construct an empty view.
|
||||
Crs () : entries(), row_map() {}
|
||||
|
||||
//! Copy constructor (shallow copy).
|
||||
Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
|
||||
{}
|
||||
|
||||
template<class EntriesType, class RowMapType>
|
||||
Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
|
||||
{}
|
||||
|
||||
/** \brief Assign to a view of the rhs array.
|
||||
* If the old view is the last view
|
||||
* then allocated memory is deallocated.
|
||||
*/
|
||||
Crs& operator= (const Crs& rhs) {
|
||||
entries = rhs.entries;
|
||||
row_map = rhs.row_map;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** \brief Destroy this view of the array.
|
||||
* If the last view then allocated memory is deallocated.
|
||||
*/
|
||||
~Crs() {}
|
||||
|
||||
/** \brief Return number of rows in the graph
|
||||
*/
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_type numRows() const {
|
||||
return (row_map.dimension_0 () != 0) ?
|
||||
row_map.dimension_0 () - static_cast<size_type> (1) :
|
||||
static_cast<size_type> (0);
|
||||
}
|
||||
};
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class OutCounts,
|
||||
class DataType,
|
||||
class Arg1Type,
|
||||
class Arg2Type,
|
||||
class SizeType>
|
||||
void get_crs_transpose_counts(
|
||||
OutCounts& out,
|
||||
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
|
||||
std::string const& name = "transpose_counts");
|
||||
|
||||
template< class OutCounts,
|
||||
class InCrs>
|
||||
void get_crs_row_map_from_counts(
|
||||
OutCounts& out,
|
||||
InCrs const& in,
|
||||
std::string const& name = "row_map");
|
||||
|
||||
template< class DataType,
|
||||
class Arg1Type,
|
||||
class Arg2Type,
|
||||
class SizeType>
|
||||
void transpose_crs(
|
||||
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
|
||||
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace Experimental {
|
||||
|
||||
template <class InCrs, class OutCounts>
|
||||
class GetCrsTransposeCounts {
|
||||
public:
|
||||
using execution_space = typename InCrs::execution_space;
|
||||
using self_type = GetCrsTransposeCounts<InCrs, OutCounts>;
|
||||
using index_type = typename InCrs::size_type;
|
||||
private:
|
||||
InCrs in;
|
||||
OutCounts out;
|
||||
public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(index_type i) const {
|
||||
atomic_increment( &out[in.entries(i)] );
|
||||
}
|
||||
GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out):
|
||||
in(arg_in),out(arg_out) {
|
||||
using policy_type = RangePolicy<index_type, execution_space>;
|
||||
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
|
||||
const closure_type closure(*this, policy_type(0, index_type(in.entries.size())));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
};
|
||||
|
||||
template <class InCounts, class OutRowMap>
|
||||
class CrsRowMapFromCounts {
|
||||
public:
|
||||
using execution_space = typename InCounts::execution_space;
|
||||
using value_type = typename OutRowMap::value_type;
|
||||
using index_type = typename InCounts::size_type;
|
||||
private:
|
||||
InCounts in;
|
||||
OutRowMap out;
|
||||
public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(index_type i, value_type& update, bool final_pass) const {
|
||||
update += in(i);
|
||||
if (final_pass) {
|
||||
out(i + 1) = update;
|
||||
if (i == 0) {
|
||||
out(0) = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void init(value_type& update) const { update = 0; }
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join(volatile value_type& update, const volatile value_type& input) const {
|
||||
update += input;
|
||||
}
|
||||
using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
|
||||
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
|
||||
in(arg_in),out(arg_out) {
|
||||
using policy_type = RangePolicy<index_type, execution_space>;
|
||||
using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
|
||||
closure_type closure(*this, policy_type(0, in.size()));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
};
|
||||
|
||||
template <class InCrs, class OutCrs>
|
||||
class FillCrsTransposeEntries {
|
||||
public:
|
||||
using execution_space = typename InCrs::execution_space;
|
||||
using memory_space = typename InCrs::memory_space;
|
||||
using value_type = typename OutCrs::entries_type::value_type;
|
||||
using index_type = typename InCrs::size_type;
|
||||
private:
|
||||
using counters_type = View<index_type*, memory_space>;
|
||||
InCrs in;
|
||||
OutCrs out;
|
||||
counters_type counters;
|
||||
public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(index_type i) const {
|
||||
auto begin = in.row_map(i);
|
||||
auto end = in.row_map(i + 1);
|
||||
for (auto j = begin; j < end; ++j) {
|
||||
auto ti = in.entries(j);
|
||||
auto tbegin = out.row_map(ti);
|
||||
auto tj = atomic_fetch_add( &counters(ti), 1 );
|
||||
out.entries( tbegin + tj ) = i;
|
||||
}
|
||||
}
|
||||
using self_type = FillCrsTransposeEntries<InCrs, OutCrs>;
|
||||
FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out):
|
||||
in(arg_in),out(arg_out),
|
||||
counters("counters", arg_out.numRows()) {
|
||||
using policy_type = RangePolicy<index_type, execution_space>;
|
||||
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
|
||||
const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
};
|
||||
|
||||
}}} // namespace Kokkos::Impl::Experimental
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
template< class OutCounts,
|
||||
class DataType,
|
||||
class Arg1Type,
|
||||
class Arg2Type,
|
||||
class SizeType>
|
||||
void get_crs_transpose_counts(
|
||||
OutCounts& out,
|
||||
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
|
||||
std::string const& name) {
|
||||
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
|
||||
out = OutCounts(name, in.numRows());
|
||||
Kokkos::Impl::Experimental::
|
||||
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
|
||||
}
|
||||
|
||||
template< class OutRowMap,
|
||||
class InCounts>
|
||||
void get_crs_row_map_from_counts(
|
||||
OutRowMap& out,
|
||||
InCounts const& in,
|
||||
std::string const& name) {
|
||||
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
|
||||
Kokkos::Impl::Experimental::
|
||||
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
|
||||
}
|
||||
|
||||
template< class DataType,
|
||||
class Arg1Type,
|
||||
class Arg2Type,
|
||||
class SizeType>
|
||||
void transpose_crs(
|
||||
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
|
||||
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in)
|
||||
{
|
||||
typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ;
|
||||
typedef typename crs_type::memory_space memory_space ;
|
||||
typedef View<SizeType*, memory_space> counts_type ;
|
||||
{
|
||||
counts_type counts;
|
||||
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
|
||||
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
|
||||
"tranpose_row_map");
|
||||
}
|
||||
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
|
||||
Kokkos::Impl::Experimental::
|
||||
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
|
||||
}
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif /* #define KOKKOS_CRS_HPP */
|
|
@ -217,8 +217,8 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
cudaStream_t m_stream ;
|
||||
int m_device ;
|
||||
cudaStream_t m_stream ;
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
@ -295,6 +295,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
|||
#include <Cuda/Kokkos_Cuda_Team.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Task.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
|
|
|
@ -90,7 +90,7 @@ public:
|
|||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
static constexpr const char* name() { return m_name; }
|
||||
|
||||
/*--------------------------------*/
|
||||
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
|
||||
|
@ -186,7 +186,7 @@ public:
|
|||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
static constexpr const char* name() { return m_name; }
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
|
@ -234,7 +234,7 @@ public:
|
|||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
static constexpr const char* name() { return m_name; }
|
||||
|
||||
private:
|
||||
|
||||
|
|
|
@ -384,6 +384,7 @@ Impl::PerThreadValue PerThread(const int& arg);
|
|||
* WorkTag (none): Tag which is used as the first argument for the functor operator.
|
||||
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
|
||||
* IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer Index type used to iterate over the Index space.
|
||||
* LaunchBounds<int,int> (LaunchBounds<1024,1>): Launch Bounds for CUDA compilation.
|
||||
*/
|
||||
template< class ... Properties>
|
||||
class TeamPolicy: public
|
||||
|
@ -561,6 +562,45 @@ KOKKOS_INLINE_FUNCTION
|
|||
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType>
|
||||
ThreadVectorRange( const TeamMemberType&, const iType& count );
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
namespace Impl {
|
||||
|
||||
template<typename FunctorType, typename TagType,
|
||||
bool HasTag = !std::is_same<TagType, void>::value >
|
||||
struct ParallelConstructName;
|
||||
|
||||
template<typename FunctorType, typename TagType>
|
||||
struct ParallelConstructName<FunctorType, TagType, true> {
|
||||
ParallelConstructName(std::string const& label):label_ref(label) {
|
||||
if (label.empty()) {
|
||||
default_name = std::string(typeid(FunctorType).name()) + "/" +
|
||||
typeid(TagType).name();
|
||||
}
|
||||
}
|
||||
std::string const& get() {
|
||||
return (label_ref.empty()) ? default_name : label_ref;
|
||||
}
|
||||
std::string const& label_ref;
|
||||
std::string default_name;
|
||||
};
|
||||
|
||||
template<typename FunctorType, typename TagType>
|
||||
struct ParallelConstructName<FunctorType, TagType, false> {
|
||||
ParallelConstructName(std::string const& label):label_ref(label) {
|
||||
if (label.empty()) {
|
||||
default_name = std::string(typeid(FunctorType).name());
|
||||
}
|
||||
}
|
||||
std::string const& get() {
|
||||
return (label_ref.empty()) ? default_name : label_ref;
|
||||
}
|
||||
std::string const& label_ref;
|
||||
std::string default_name;
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
#endif /* defined KOKKOS_ENABLE_PROFILING */
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #define KOKKOS_EXECPOLICY_HPP */
|
||||
|
|
|
@ -126,14 +126,6 @@ public:
|
|||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device< execution_space, memory_space > device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HBWSpace */
|
||||
static int in_parallel();
|
||||
|
||||
static void register_in_parallel( int (*)() );
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
HBWSpace();
|
||||
HBWSpace( const HBWSpace & rhs ) = default;
|
||||
|
|
|
@ -130,14 +130,6 @@ public:
|
|||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device< execution_space, memory_space > device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HostSpace */
|
||||
static int in_parallel();
|
||||
|
||||
static void register_in_parallel( int (*)() );
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
HostSpace();
|
||||
HostSpace( HostSpace && rhs ) = default;
|
||||
|
@ -161,7 +153,7 @@ public:
|
|||
, const size_t arg_alloc_size ) const;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
static constexpr const char* name() { return m_name; }
|
||||
|
||||
private:
|
||||
AllocationMechanism m_alloc_mech;
|
||||
|
|
|
@ -156,6 +156,8 @@ struct LayoutStride {
|
|||
for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) {
|
||||
tmp.dimension[r] = 0 ;
|
||||
tmp.stride[r] = 0 ;
|
||||
}
|
||||
for ( int r = 0 ; r < rank ; ++r ) {
|
||||
check_input &= ~int( 1 << order[r] );
|
||||
}
|
||||
if ( 0 == check_input ) {
|
||||
|
|
|
@ -297,6 +297,10 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ARCH_AVX512MIC )
|
||||
#define KOKKOS_ENABLE_RFO_PREFETCH 1
|
||||
#endif
|
||||
|
||||
#if defined( __MIC__ )
|
||||
// Compiling for Xeon Phi
|
||||
#endif
|
||||
|
@ -344,13 +348,18 @@
|
|||
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
|
||||
|
||||
#if defined( KOKKOS_ARCH_AVX512MIC )
|
||||
#define KOKKOS_ENABLE_RFO_PREFETCH 1
|
||||
#endif
|
||||
|
||||
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
|
||||
( defined( __amd64 ) || defined( __amd64__ ) || \
|
||||
defined( __x86_64 ) || defined( __x86_64__ ) )
|
||||
defined( __x86_64 ) || defined( __x86_64__ ) || \
|
||||
defined(__PPC64__) )
|
||||
#define KOKKOS_ENABLE_ASM 1
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_MASTER_LOCK_HPP
|
||||
#define KOKKOS_MASTER_LOCK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
// MasterLock<ExecutionSpace> may be used to coordinate work between master instances
|
||||
// SHOULD NOT be used within a parallel algorithm
|
||||
//
|
||||
// This lock should be used with a scoped lock guard
|
||||
// e.g. std::unique_lock<Lock>, std::lock_guard<Lock>
|
||||
//
|
||||
// Cannot be copied or moved;
|
||||
// has the following functions available
|
||||
//
|
||||
// Lock()
|
||||
|
||||
// ~Lock()
|
||||
|
||||
//
|
||||
// void lock()
|
||||
|
||||
// void unlock()
|
||||
|
||||
// bool try_lock()
|
||||
|
||||
//
|
||||
// Only the primary template is declared in this header; no definition is given here.
|
||||
template <typename ExecutionSpace>
|
||||
class MasterLock;
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif //KOKKOS_MASTER_LOCK_HPP
|
||||
|
|
@ -66,11 +66,6 @@ private:
|
|||
enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
|
||||
enum : uint32_t { max_bit_count = CB::max_bit_count };
|
||||
|
||||
/* Defaults for min block, max block, and superblock sizes */
|
||||
enum : uint32_t { MIN_BLOCK_SIZE_LG2 = 6 /* 64 bytes */ };
|
||||
enum : uint32_t { MAX_BLOCK_SIZE_LG2 = 12 /* 4k bytes */ };
|
||||
enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 /* 64k bytes */ };
|
||||
|
||||
enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
|
||||
|
||||
/* Each superblock has a concurrent bitset state
|
||||
|
@ -85,6 +80,14 @@ private:
|
|||
* is concurrently updated.
|
||||
*/
|
||||
|
||||
/* Mapping between block_size <-> block_state
|
||||
*
|
||||
* block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
|
||||
* block_size = m_sb_size_lg2 - ( block_state >> state_shift )
|
||||
*
|
||||
* Thus A_block_size < B_block_size <=> A_block_state > B_block_state
|
||||
*/
|
||||
|
||||
typedef typename DeviceType::memory_space base_memory_space ;
|
||||
|
||||
enum { accessible =
|
||||
|
@ -251,10 +254,10 @@ public:
|
|||
* significant runtime performance improvements.
|
||||
*/
|
||||
MemoryPool( const base_memory_space & memspace
|
||||
, const size_t min_total_alloc_size
|
||||
, const uint32_t min_block_alloc_size // = 1 << MIN_BLOCK_SIZE_LG2
|
||||
, const uint32_t max_block_alloc_size // = 1 << MAX_BLOCK_SIZE_LG2
|
||||
, const uint32_t min_superblock_size // = 1 << SUPERBLOCK_SIZE_LG2
|
||||
, const size_t min_total_alloc_size
|
||||
, size_t min_block_alloc_size = 0
|
||||
, size_t max_block_alloc_size = 0
|
||||
, size_t min_superblock_size = 0
|
||||
)
|
||||
: m_tracker()
|
||||
, m_sb_state_array(0)
|
||||
|
@ -267,8 +270,43 @@ public:
|
|||
, m_data_offset(0)
|
||||
, m_unused_padding(0)
|
||||
{
|
||||
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
|
||||
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
|
||||
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
|
||||
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
|
||||
|
||||
// Constraints and defaults:
|
||||
// min_block_alloc_size <= max_block_alloc_size
|
||||
// max_block_alloc_size <= min_superblock_size
|
||||
// min_superblock_size <= min_total_alloc_size
|
||||
|
||||
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
|
||||
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
|
||||
|
||||
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
|
||||
|
||||
if ( 0 == max_block_alloc_size ) {
|
||||
|
||||
max_block_alloc_size = MAX_BLOCK_SIZE ;
|
||||
|
||||
// Upper bound of total allocation size
|
||||
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
|
||||
, min_total_alloc_size );
|
||||
|
||||
// Lower bound of minimum block size
|
||||
max_block_alloc_size = std::max( max_block_alloc_size
|
||||
, min_block_alloc_size );
|
||||
}
|
||||
|
||||
if ( 0 == min_superblock_size ) {
|
||||
min_superblock_size = max_block_alloc_size ;
|
||||
|
||||
// Upper bound of total allocation size
|
||||
min_superblock_size = std::min( size_t(min_superblock_size)
|
||||
, min_total_alloc_size );
|
||||
|
||||
// Lower bound of maximum block size
|
||||
min_superblock_size = std::max( min_superblock_size
|
||||
, max_block_alloc_size );
|
||||
}
|
||||
|
||||
// Block and superblock size is power of two:
|
||||
|
||||
|
@ -435,6 +473,8 @@ public:
|
|||
void * allocate( size_t alloc_size
|
||||
, int32_t attempt_limit = 1 ) const noexcept
|
||||
{
|
||||
if ( 0 == alloc_size ) return (void*) 0 ;
|
||||
|
||||
void * p = 0 ;
|
||||
|
||||
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
|
||||
|
@ -444,10 +484,9 @@ public:
|
|||
// Allocation will fit within a superblock
|
||||
// that has block sizes ( 1 << block_size_lg2 )
|
||||
|
||||
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
|
||||
const uint32_t block_state = block_count_lg2 << state_shift ;
|
||||
const uint32_t block_count = 1u << block_count_lg2 ;
|
||||
const uint32_t block_count_mask = block_count - 1 ;
|
||||
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
|
||||
const uint32_t block_state = block_count_lg2 << state_shift ;
|
||||
const uint32_t block_count = 1u << block_count_lg2 ;
|
||||
|
||||
// Superblock hints for this block size:
|
||||
// hint_sb_id_ptr[0] is the dynamically changing hint
|
||||
|
@ -465,7 +504,7 @@ public:
|
|||
// the guess for which block within a superblock should
|
||||
// be claimed. If not available then a search occurs.
|
||||
|
||||
const uint32_t block_id_hint = block_count_mask &
|
||||
const uint32_t block_id_hint =
|
||||
(uint32_t)( Kokkos::Impl::clock_tic()
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
|
||||
// Spread out potentially concurrent access
|
||||
|
@ -474,6 +513,9 @@ public:
|
|||
#endif
|
||||
);
|
||||
|
||||
// expected state of superblock for allocation
|
||||
uint32_t sb_state = block_state ;
|
||||
|
||||
int32_t sb_id = -1 ;
|
||||
|
||||
volatile uint32_t * sb_state_array = 0 ;
|
||||
|
@ -484,6 +526,8 @@ public:
|
|||
|
||||
if ( sb_id < 0 ) {
|
||||
|
||||
// No superblock specified, try the hint for this block size
|
||||
|
||||
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
|
||||
|
||||
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
|
||||
|
@ -493,16 +537,20 @@ public:
|
|||
// 0 <= sb_id
|
||||
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
|
||||
|
||||
if ( block_state == ( state_header_mask & *sb_state_array ) ) {
|
||||
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
|
||||
|
||||
// This superblock state is assigned to this block size.
|
||||
// Try to claim a bit.
|
||||
// This superblock state is as expected, for the moment.
|
||||
// Attempt to claim a bit. The attempt updates the state
|
||||
// so have already made sure the state header is as expected.
|
||||
|
||||
const uint32_t count_lg2 = sb_state >> state_shift ;
|
||||
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
|
||||
|
||||
const Kokkos::pair<int,int> result =
|
||||
CB::acquire_bounded_lg2( sb_state_array
|
||||
, block_count_lg2
|
||||
, block_id_hint
|
||||
, block_state
|
||||
, count_lg2
|
||||
, block_id_hint & mask
|
||||
, sb_state
|
||||
);
|
||||
|
||||
// If result.first < 0 then failed to acquire
|
||||
|
@ -512,16 +560,18 @@ public:
|
|||
|
||||
if ( 0 <= result.first ) { // acquired a bit
|
||||
|
||||
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
|
||||
|
||||
// Set the allocated block pointer
|
||||
|
||||
p = ((char*)( m_sb_state_array + m_data_offset ))
|
||||
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
|
||||
+ ( result.first << block_size_lg2 ); // block memory
|
||||
+ ( result.first << size_lg2 ); // block memory
|
||||
|
||||
break ; // Success
|
||||
}
|
||||
|
||||
// printf(" acquire block_count_lg2(%d) block_state(0x%x) sb_id(%d) result(%d,%d)\n" , block_count_lg2 , block_state , sb_id , result.first , result.second );
|
||||
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
|
||||
|
||||
}
|
||||
//------------------------------------------------------------------
|
||||
|
@ -529,12 +579,18 @@ public:
|
|||
// Must find a new superblock.
|
||||
|
||||
// Start searching at designated index for this block size.
|
||||
// Look for a partially full superblock of this block size.
|
||||
// Look for an empty superblock just in case cannot find partfull.
|
||||
// Look for superblock that, in preferential order,
|
||||
// 1) part-full superblock of this block size
|
||||
// 2) empty superblock to claim for this block size
|
||||
// 3) part-full superblock of the next larger block size
|
||||
|
||||
sb_state = block_state ; // Expect to find the desired state
|
||||
sb_id = -1 ;
|
||||
|
||||
bool update_hint = false ;
|
||||
int32_t sb_id_empty = -1 ;
|
||||
int32_t sb_id_large = -1 ;
|
||||
uint32_t sb_state_large = 0 ;
|
||||
|
||||
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
|
||||
|
||||
|
@ -544,38 +600,54 @@ public:
|
|||
// Note that the state may change at any moment
|
||||
// as concurrent allocations and deallocations occur.
|
||||
|
||||
const uint32_t state = *sb_state_array ;
|
||||
const uint32_t used = state & state_used_mask ;
|
||||
const uint32_t full_state = *sb_state_array ;
|
||||
const uint32_t used = full_state & state_used_mask ;
|
||||
const uint32_t state = full_state & state_header_mask ;
|
||||
|
||||
if ( block_state == ( state & state_header_mask ) ) {
|
||||
if ( state == block_state ) {
|
||||
|
||||
// Superblock is assigned to this block size
|
||||
|
||||
if ( used < block_count ) {
|
||||
if ( used < block_count ) {
|
||||
|
||||
// There is room to allocate one block
|
||||
|
||||
sb_id = id ;
|
||||
|
||||
if ( used + 1 < block_count ) {
|
||||
// Is there room to allocate more than one block?
|
||||
|
||||
// There is room to allocate more than one block
|
||||
|
||||
Kokkos::atomic_compare_exchange
|
||||
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
|
||||
}
|
||||
update_hint = used + 1 < block_count ;
|
||||
|
||||
break ;
|
||||
}
|
||||
}
|
||||
else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) {
|
||||
else if ( 0 == used ) {
|
||||
|
||||
// Superblock is not assigned to this block size
|
||||
// and is the first empty superblock encountered.
|
||||
// Save this id to use if a partfull superblock is not found.
|
||||
// Superblock is empty
|
||||
|
||||
sb_id_empty = id ;
|
||||
if ( -1 == sb_id_empty ) {
|
||||
|
||||
// Superblock is not assigned to this block size
|
||||
// and is the first empty superblock encountered.
|
||||
// Save this id to use if a partfull superblock is not found.
|
||||
|
||||
sb_id_empty = id ;
|
||||
}
|
||||
}
|
||||
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
|
||||
( -1 == sb_id_large /* have not found a larger */ ) &&
|
||||
( state < block_state /* a larger block */ ) &&
|
||||
// is not full:
|
||||
( used < ( 1u << ( state >> state_shift ) ) ) ) {
|
||||
// First superblock encountered that is
|
||||
// larger than this block size and
|
||||
// has room for an allocation.
|
||||
// Save this id to use if a partfull or empty superblock is not found
|
||||
sb_id_large = id ;
|
||||
sb_state_large = state ;
|
||||
}
|
||||
|
||||
// Iterate around the superblock array:
|
||||
|
||||
if ( ++id < m_sb_count ) {
|
||||
sb_state_array += m_sb_state_size ;
|
||||
|
@ -586,7 +658,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)\n" , m_sb_count , sb_id , sb_id_empty );
|
||||
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
|
||||
|
||||
if ( sb_id < 0 ) {
|
||||
|
||||
|
@ -609,21 +681,31 @@ public:
|
|||
|
||||
const uint32_t state_empty = state_header_mask & *sb_state_array ;
|
||||
|
||||
if ( state_empty ==
|
||||
Kokkos::atomic_compare_exchange
|
||||
(sb_state_array,state_empty,block_state) ) {
|
||||
// If this thread claims the empty block then update the hint
|
||||
update_hint =
|
||||
state_empty ==
|
||||
Kokkos::atomic_compare_exchange
|
||||
(sb_state_array,state_empty,block_state);
|
||||
}
|
||||
else if ( 0 <= sb_id_large ) {
|
||||
|
||||
// If this thread claimed the block then update the hint
|
||||
// Found a larger superblock with space available
|
||||
|
||||
Kokkos::atomic_compare_exchange
|
||||
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
|
||||
}
|
||||
sb_id = sb_id_large ;
|
||||
sb_state = sb_state_large ;
|
||||
|
||||
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
|
||||
}
|
||||
else {
|
||||
// Did not find a potentially usable superblock
|
||||
--attempt_limit ;
|
||||
}
|
||||
}
|
||||
|
||||
if ( update_hint ) {
|
||||
Kokkos::atomic_compare_exchange
|
||||
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
|
||||
}
|
||||
} // end allocation attempt loop
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
|
@ -646,6 +728,8 @@ public:
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
|
||||
{
|
||||
if ( 0 == p ) return ;
|
||||
|
||||
// Determine which superblock and block
|
||||
const ptrdiff_t d =
|
||||
((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
|
||||
|
|
|
@ -72,11 +72,11 @@ struct MemoryTraits {
|
|||
//! Tag this class as a kokkos memory traits:
|
||||
typedef MemoryTraits memory_traits ;
|
||||
|
||||
enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) };
|
||||
enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
|
||||
enum { Atomic = T & unsigned(Kokkos::Atomic) };
|
||||
enum { Restrict = T & unsigned(Kokkos::Restrict) };
|
||||
enum { Aligned = T & unsigned(Kokkos::Aligned) };
|
||||
enum : bool { Unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) };
|
||||
enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) };
|
||||
enum : bool { Atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
|
||||
enum : bool { Restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) };
|
||||
enum : bool { Aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
|
||||
|
||||
};
|
||||
|
||||
|
@ -109,7 +109,11 @@ enum { MEMORY_ALIGNMENT =
|
|||
#else
|
||||
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
|
||||
#endif
|
||||
, MEMORY_ALIGNMENT_THRESHOLD = 4
|
||||
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
|
||||
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
|
||||
#else
|
||||
, MEMORY_ALIGNMENT_THRESHOLD = 4
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -47,10 +47,6 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP)
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
|
@ -67,95 +63,144 @@
|
|||
#include <Kokkos_Layout.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <vector>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
class OpenMPExec;
|
||||
}
|
||||
|
||||
/// \class OpenMP
|
||||
/// \brief Kokkos device for multicore processors in the host memory space.
|
||||
class OpenMP {
|
||||
public:
|
||||
//------------------------------------
|
||||
//! \name Type declarations that all Kokkos devices must provide.
|
||||
//@{
|
||||
|
||||
//! Tag this class as a kokkos execution space
|
||||
using execution_space = OpenMP;
|
||||
|
||||
using memory_space =
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
using memory_space = Experimental::HBWSpace;
|
||||
Experimental::HBWSpace;
|
||||
#else
|
||||
using memory_space = HostSpace;
|
||||
HostSpace;
|
||||
#endif
|
||||
|
||||
//! This execution space preferred device_type
|
||||
using device_type = Kokkos::Device<execution_space,memory_space>;
|
||||
|
||||
using array_layout = LayoutRight;
|
||||
using size_type = memory_space::size_type;
|
||||
|
||||
using device_type = Kokkos::Device< execution_space, memory_space >;
|
||||
using array_layout = LayoutRight;
|
||||
using size_type = memory_space::size_type;
|
||||
using scratch_memory_space = ScratchMemorySpace< OpenMP >;
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
//! \name Functions that all Kokkos execution spaces must implement.
|
||||
//@{
|
||||
/// \brief Get a handle to the default execution space instance
|
||||
inline
|
||||
OpenMP() noexcept;
|
||||
|
||||
inline static bool in_parallel();
|
||||
// Using omp_get_max_threads(); is problematic
|
||||
// On Intel (essentially an initial call to the OpenMP runtime
|
||||
// without a parallel region before will set a process mask for a single core
|
||||
// The runtime will then bind threads for a parallel region to other cores on
|
||||
// entering the first parallel region and make the process mask the aggregate of
|
||||
// the thread masks. The intent seems to be to make serial code run fast, if you
|
||||
// compile with OpenMP enabled but don't actually use parallel regions or so
|
||||
// static int omp_max_threads = omp_get_max_threads();
|
||||
static int get_current_max_threads() noexcept;
|
||||
|
||||
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
|
||||
static bool sleep();
|
||||
/// \brief Initialize the default execution space
|
||||
///
|
||||
/// if ( thread_count == -1 )
|
||||
/// then use the number of threads that openmp defaults to
|
||||
/// if ( thread_count == 0 && Kokkos::hwloc::available() )
|
||||
/// then use hwloc to choose the number of threads and change
|
||||
/// the default number of threads
|
||||
/// if ( thread_count > 0 )
|
||||
/// then force openmp to use the given number of threads and change
|
||||
/// the default number of threads
|
||||
static void initialize( int thread_count = -1 );
|
||||
|
||||
/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
|
||||
static bool wake();
|
||||
|
||||
/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
|
||||
static void fence() {}
|
||||
|
||||
/// \brief Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool detail = false );
|
||||
|
||||
/// \brief Free any resources being consumed by the device.
|
||||
/// \brief Free any resources being consumed by the default execution space
|
||||
static void finalize();
|
||||
|
||||
/** \brief Initialize the device.
|
||||
*
|
||||
* 1) If the hardware locality library is enabled and OpenMP has not
|
||||
* already bound threads then bind OpenMP threads to maximize
|
||||
* core utilization and group for memory hierarchy locality.
|
||||
*
|
||||
* 2) Allocate a HostThread for each OpenMP thread to hold its
|
||||
* topology and fan in/out data.
|
||||
*/
|
||||
static void initialize( unsigned thread_count = 0 ,
|
||||
unsigned use_numa_count = 0 ,
|
||||
unsigned use_cores_per_numa = 0 );
|
||||
/// \brief is the default execution space initialized for current 'master' thread
|
||||
static bool is_initialized() noexcept;
|
||||
|
||||
static int is_initialized();
|
||||
/// \brief Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool verbose = false );
|
||||
|
||||
/** \brief Return the maximum amount of concurrency. */
|
||||
static int concurrency();
|
||||
/// \brief is the instance running a parallel algorithm
|
||||
inline
|
||||
static bool in_parallel( OpenMP const& = OpenMP() ) noexcept;
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
/** \brief This execution space has a topological thread pool which can be queried.
|
||||
*
|
||||
* All threads within a pool have a common memory space for which they are cache coherent.
|
||||
* depth = 0 gives the number of threads in the whole pool.
|
||||
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
|
||||
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
|
||||
*/
|
||||
inline static int thread_pool_size( int depth = 0 );
|
||||
/// \brief Wait until all dispatched functors complete on the given instance
|
||||
///
|
||||
/// This is a no-op on OpenMP
|
||||
inline
|
||||
static void fence( OpenMP const& = OpenMP() ) noexcept;
|
||||
|
||||
/// \brief Does the given instance return immediately after launching
|
||||
/// a parallel algorithm
|
||||
///
|
||||
/// This always returns false on OpenMP
|
||||
inline
|
||||
static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept;
|
||||
|
||||
|
||||
/// \brief Partition the default instance into new instances without creating
|
||||
/// new masters
|
||||
///
|
||||
/// This is a no-op on OpenMP since the default instance cannot be partitioned
|
||||
/// without promoting other threads to 'master'
|
||||
static std::vector<OpenMP> partition(...);
|
||||
|
||||
/// Non-default instances should be ref-counted so that when the last
|
||||
/// is destroyed the instance resources are released
|
||||
///
|
||||
/// This is a no-op on OpenMP since a non default instance cannot be created
|
||||
static OpenMP create_instance(...);
|
||||
|
||||
/// \brief Partition the default instance and call 'f' on each new 'master' thread
|
||||
///
|
||||
/// F is a functor with the following signature
|
||||
/// void( int partition_id, int num_partitions )
|
||||
template <typename F>
|
||||
static void partition_master( F const& f
|
||||
, int requested_num_partitions = 0
|
||||
, int requested_partition_size = 0
|
||||
);
|
||||
|
||||
inline
|
||||
static int thread_pool_size() noexcept;
|
||||
|
||||
/** \brief The rank of the executing thread in this thread pool */
|
||||
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static int thread_pool_rank() noexcept;
|
||||
|
||||
//------------------------------------
|
||||
#if !defined( KOKKOS_DISABLE_DEPRECATED )
|
||||
/// \brief Initialize the default execution space
|
||||
static void initialize( int thread_count,
|
||||
int use_numa_count,
|
||||
int use_cores_per_numa = 0);
|
||||
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
inline
|
||||
static int thread_pool_size( int depth );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
static void sleep() {};
|
||||
static void wake() {};
|
||||
|
||||
static const char* name();
|
||||
// use UniqueToken
|
||||
static int concurrency();
|
||||
|
||||
// use UniqueToken
|
||||
inline
|
||||
static int max_hardware_threads() noexcept;
|
||||
|
||||
// use UniqueToken
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static int hardware_thread_id() noexcept;
|
||||
#endif
|
||||
|
||||
static constexpr const char* name() noexcept { return "OpenMP"; }
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
@ -195,6 +240,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
|||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Team.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
|
||||
|
||||
|
|
|
@ -177,22 +177,23 @@ void parallel_for( const ExecPolicy & policy
|
|||
)
|
||||
{
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str);
|
||||
Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Kokkos::Impl::shared_allocation_tracking_disable();
|
||||
Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
Kokkos::Impl::shared_allocation_tracking_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelFor(kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelFor(kpID);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -210,14 +211,15 @@ void parallel_for( const size_t work_count
|
|||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
|
||||
Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Kokkos::Impl::shared_allocation_tracking_disable();
|
||||
Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
Kokkos::Impl::shared_allocation_tracking_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
|
@ -420,21 +422,22 @@ void parallel_scan( const ExecutionPolicy & policy
|
|||
{
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str);
|
||||
Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Kokkos::Impl::shared_allocation_tracking_disable();
|
||||
Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
Kokkos::Impl::shared_allocation_tracking_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -453,21 +456,22 @@ void parallel_scan( const size_t work_count
|
|||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
|
||||
Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Kokkos::Impl::shared_allocation_tracking_disable();
|
||||
Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
Kokkos::Impl::shared_allocation_tracking_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
@ -872,13 +872,14 @@ namespace Impl {
|
|||
const FunctorType& functor,
|
||||
ReturnType& return_value) {
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
|
||||
}
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label);
|
||||
Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID);
|
||||
}
|
||||
#endif
|
||||
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Kokkos::Impl::shared_allocation_tracking_disable();
|
||||
#ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
|
||||
Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type >
|
||||
closure(functor_adaptor::functor(functor),
|
||||
|
@ -890,13 +891,13 @@ namespace Impl {
|
|||
policy,
|
||||
return_value_adapter::return_value(return_value,functor));
|
||||
#endif
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
Kokkos::Impl::shared_allocation_tracking_enable();
|
||||
closure.execute();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelReduce(kpID);
|
||||
}
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelReduce(kpID);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -66,6 +66,7 @@
|
|||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
#include <Kokkos_UniqueToken.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
|
@ -526,6 +527,7 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
|
@ -604,6 +606,178 @@ public:
|
|||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
// ParallelFor specialization selected for a Kokkos::Experimental::MDRangePolicy
// on the Kokkos::Serial execution space. It walks a one-dimensional range of
// tile indices and applies the tile iterator to each index in turn.
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType ,
|
||||
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
|
||||
Kokkos::Serial
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
|
||||
// One-dimensional range policy type exported by the MDRangePolicy.
typedef typename MDRangePolicy::impl_range_policy Policy ;
|
||||
|
||||
// Callable built from (policy, functor) and invoked with one index of m_policy.
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
|
||||
|
||||
// Declaration order matters: m_policy is initialized from m_mdr_policy below.
const FunctorType m_functor ;
|
||||
const MDRangePolicy m_mdr_policy ;
|
||||
const Policy m_policy ;
|
||||
|
||||
// Sequentially visit every index in [m_policy.begin(), m_policy.end()).
void
|
||||
exec() const
|
||||
{
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
// A temporary iterate_type is constructed for each index and called with it.
iterate_type( m_mdr_policy, m_functor )( i );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
// Run the loop; simply forwards to exec().
inline
|
||||
void execute() const
|
||||
{ this->exec(); }
|
||||
|
||||
// Copies the functor and the MD-range policy, then builds the flat policy
// over [0, m_mdr_policy.m_num_tiles) with a chunk size of 1.
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const MDRangePolicy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class ReducerType , class ... Traits > // ParallelReduce specialization: MDRangePolicy executed on Kokkos::Serial
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Serial
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
|
||||
typedef typename MDRangePolicy::impl_range_policy Policy ; // flat range policy used to walk the tiles
|
||||
|
||||
typedef typename MDRangePolicy::work_tag WorkTag ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional; // NOTE(review): presumably picks FunctorType when ReducerType is InvalidType - confirm against if_c
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef typename ReducerTypeFwd::value_type ValueType;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
|
||||
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
|
||||
, FunctorType
|
||||
, WorkTag
|
||||
, ValueType
|
||||
>;
|
||||
|
||||
|
||||
const FunctorType m_functor ; // copy of the functor passed to the constructor
|
||||
const MDRangePolicy m_mdr_policy ; // copy of the multi-dimensional policy passed to the constructor
|
||||
const Policy m_policy ; // range [0, m_mdr_policy.m_num_tiles) with chunk size 1, see constructors
|
||||
const ReducerType m_reducer ; // InvalidType() in the view-result constructor, the caller's reducer otherwise
|
||||
const pointer_type m_result_ptr ; // data pointer of the result view; execute() falls back to scratch when it is null
|
||||
|
||||
inline
|
||||
void
|
||||
exec( reference_type update ) const // visits every tile index of m_policy, passing the same 'update' reference to each tile
|
||||
{
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
iterate_type( m_mdr_policy, m_functor, update )( i ); // construct a temporary tile iterator and invoke it with tile index i
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
const size_t team_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_shared_size = 0 ; // Never shrinks
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
serial_resize_thread_team_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
HostThreadTeamData & data = *serial_get_thread_team_data();
|
||||
|
||||
pointer_type ptr =
|
||||
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local()); // caller's result location if non-null, otherwise the pool_reduce_local() scratch of 'data'
|
||||
|
||||
reference_type update =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr ); // 'update' is the reference exec() hands to every tile
|
||||
|
||||
this-> exec( update );
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr ); // finalization step runs once, after all tiles were visited
|
||||
}
|
||||
|
||||
template< class HostViewType > // constructor taking a result view; enabled only when ReducerType is not a reducer type
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const MDRangePolicy & arg_policy ,
|
||||
const HostViewType & arg_result_view ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) // reads the member m_mdr_policy, which is declared (hence initialized) before m_policy
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< HostViewType >::value
|
||||
, "Kokkos::Serial reduce result must be a View" );
|
||||
|
||||
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
|
||||
, "Kokkos::Serial reduce result must be a View in HostSpace" );
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor // constructor taking a reducer object; the result pointer comes from reducer.view()
|
||||
, MDRangePolicy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) ) // reads the member m_mdr_policy, which is declared (hence initialized) before m_policy
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/ // NOTE(review): disabled check; its message names Kokkos::OpenMP although this class is the Kokkos::Serial specialization
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
@ -819,6 +993,60 @@ public:
|
|||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
template<> // UniqueToken specialization for the Serial execution space, Instance scope
|
||||
class UniqueToken< Serial, UniqueTokenScope::Instance>
|
||||
{
|
||||
public:
|
||||
using execution_space = Serial;
|
||||
using size_type = int;
|
||||
|
||||
/// \brief create object sized for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() ) noexcept {}
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
inline
|
||||
int size() const noexcept { return 1; } // always 1 for this specialization
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
inline
|
||||
int acquire() const noexcept { return 0; } // the only value below size() == 1
|
||||
|
||||
/// \brief release a value previously returned by acquire()
|
||||
inline
|
||||
void release( int ) const noexcept {} // no-op: the argument is ignored
|
||||
};
|
||||
|
||||
template<> // UniqueToken specialization for the Serial execution space, Global scope
|
||||
class UniqueToken< Serial, UniqueTokenScope::Global>
|
||||
{
|
||||
public:
|
||||
using execution_space = Serial;
|
||||
using size_type = int;
|
||||
|
||||
/// \brief create object sized for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() ) noexcept {}
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
inline
|
||||
int size() const noexcept { return 1; } // always 1 for this specialization
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
inline
|
||||
int acquire() const noexcept { return 0; } // the only value below size() == 1
|
||||
|
||||
/// \brief release a value previously returned by acquire()
|
||||
inline
|
||||
void release( int ) const noexcept {} // no-op: the argument is ignored
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#include <impl/Kokkos_Serial_Task.hpp>
|
||||
|
||||
#endif // defined( KOKKOS_ENABLE_SERIAL )
|
||||
|
|
|
@ -148,7 +148,7 @@ private:
|
|||
typename std::conditional< Arg2_is_space , Arg2 , void
|
||||
>::type >::type ;
|
||||
|
||||
using task_base = Impl::TaskBase< Space , ValueType , void > ;
|
||||
using task_base = Impl::TaskBase< void , void , void > ;
|
||||
using queue_type = Impl::TaskQueue< Space > ;
|
||||
|
||||
task_base * m_task ;
|
||||
|
@ -293,13 +293,17 @@ public:
|
|||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename task_base::get_return_type
|
||||
int is_ready() const noexcept
|
||||
{ return ( 0 == m_task ) || ( ((task_base*) task_base::LockTag) == m_task->m_wait ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const typename Impl::TaskResult< ValueType >::reference_type
|
||||
get() const
|
||||
{
|
||||
if ( 0 == m_task ) {
|
||||
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
|
||||
}
|
||||
return m_task->get();
|
||||
return Impl::TaskResult< ValueType >::get( m_task );
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -396,7 +400,7 @@ private:
|
|||
|
||||
using track_type = Kokkos::Impl::SharedAllocationTracker ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
|
||||
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
|
||||
using task_base = Impl::TaskBase< void , void , void > ;
|
||||
|
||||
track_type m_track ;
|
||||
queue_type * m_queue ;
|
||||
|
@ -464,29 +468,19 @@ public:
|
|||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
memory_pool * memory() const noexcept
|
||||
{ return m_queue ? m_queue->m_memory : (memory_pool*) 0 ; }
|
||||
{ return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }
|
||||
|
||||
//----------------------------------------
|
||||
/**\brief Allocation size for a spawned task */
|
||||
template< typename FunctorType >
|
||||
KOKKOS_FUNCTION
|
||||
size_t spawn_allocation_size() const
|
||||
{
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType > ;
|
||||
|
||||
return m_queue->allocate_block_size( sizeof(task_type) );
|
||||
}
|
||||
{ return m_queue->template spawn_allocation_size< FunctorType >(); }
|
||||
|
||||
/**\brief Allocation size for a when_all aggregate */
|
||||
KOKKOS_FUNCTION
|
||||
size_t when_all_allocation_size( int narg ) const
|
||||
{
|
||||
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
|
||||
|
||||
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
|
||||
}
|
||||
{ return m_queue->when_all_allocation_size( narg ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
|
@ -507,7 +501,7 @@ public:
|
|||
queue_type * const queue =
|
||||
arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
|
||||
arg_policy.m_dependence.m_task
|
||||
? arg_policy.m_dependence.m_task->m_queue
|
||||
? static_cast<queue_type*>(arg_policy.m_dependence.m_task->m_queue)
|
||||
: (queue_type*) 0 );
|
||||
|
||||
if ( 0 == queue ) {
|
||||
|
@ -530,8 +524,12 @@ public:
|
|||
future_type f ;
|
||||
|
||||
// Allocate task from memory pool
|
||||
|
||||
const size_t alloc_size =
|
||||
queue->template spawn_allocation_size< FunctorType >();
|
||||
|
||||
f.m_task =
|
||||
reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
|
||||
reinterpret_cast< task_type * >(queue->allocate(alloc_size) );
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
|
@ -539,15 +537,17 @@ public:
|
|||
// Reference count starts at two:
|
||||
// +1 for the matching decrement when task is complete
|
||||
// +1 for the future
|
||||
new ( f.m_task )
|
||||
task_type( arg_function
|
||||
, queue
|
||||
, arg_policy.m_dependence.m_task /* dependence */
|
||||
, 2 /* reference count */
|
||||
, int(sizeof(task_type)) /* allocation size */
|
||||
, int(arg_policy.m_task_type)
|
||||
, int(arg_policy.m_priority)
|
||||
, std::move(arg_functor) );
|
||||
new ( f.m_task ) task_type( std::move(arg_functor) );
|
||||
|
||||
f.m_task->m_apply = arg_function ;
|
||||
f.m_task->m_queue = queue ;
|
||||
f.m_task->m_next = arg_policy.m_dependence.m_task ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = alloc_size ;
|
||||
f.m_task->m_task_type = arg_policy.m_task_type ;
|
||||
f.m_task->m_priority = arg_policy.m_priority ;
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// The dependence (if any) is processed immediately
|
||||
// within the schedule function, as such the dependence's
|
||||
|
@ -586,6 +586,30 @@ public:
|
|||
// Postcondition: task is in Executing-Respawn state
|
||||
}
|
||||
|
||||
template< typename FunctorType > // respawn overload that sets only a new priority and registers a null dependence
|
||||
KOKKOS_FUNCTION static
|
||||
void
|
||||
respawn( FunctorType * arg_self
|
||||
, TaskScheduler const & // unnamed parameter: not used in the body
|
||||
, TaskPriority const & arg_priority
|
||||
)
|
||||
{
|
||||
// Precondition: task is in Executing state
|
||||
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
task_type * const task = static_cast< task_type * >( arg_self ); // NOTE(review): assumes arg_self is the functor sub-object of a task_type - confirm against TaskBase
|
||||
|
||||
task->m_priority = static_cast<int>(arg_priority);
|
||||
|
||||
task->add_dependence( (task_base*) 0 ); // null pointer: no task to wait on is supplied by this overload
|
||||
|
||||
// Postcondition: task is in Executing-Respawn state
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
/**\brief Return a future that is complete
|
||||
* when all input futures are complete.
|
||||
|
@ -596,7 +620,7 @@ public:
|
|||
when_all( Future< A1 , A2 > const arg[] , int narg )
|
||||
{
|
||||
using future_type = Future< execution_space > ;
|
||||
using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
using task_base = Kokkos::Impl::TaskBase< void , void , void > ;
|
||||
|
||||
future_type f ;
|
||||
|
||||
|
@ -610,9 +634,9 @@ public:
|
|||
// Increment reference count to track subsequent assignment.
|
||||
Kokkos::atomic_increment( &(t->m_ref_count) );
|
||||
if ( queue == 0 ) {
|
||||
queue = t->m_queue ;
|
||||
queue = static_cast< queue_type * >( t->m_queue );
|
||||
}
|
||||
else if ( queue != t->m_queue ) {
|
||||
else if ( queue != static_cast< queue_type * >( t->m_queue ) ) {
|
||||
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
|
||||
}
|
||||
}
|
||||
|
@ -620,28 +644,34 @@ public:
|
|||
|
||||
if ( queue != 0 ) {
|
||||
|
||||
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
|
||||
size_t const alloc_size = queue->when_all_allocation_size( narg );
|
||||
|
||||
f.m_task =
|
||||
reinterpret_cast< task_base * >( queue->allocate( size ) );
|
||||
reinterpret_cast< task_base * >( queue->allocate( alloc_size ) );
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
// Reference count starts at two:
|
||||
// +1 to match decrement when task completes
|
||||
// +1 for the future
|
||||
new( f.m_task ) task_base( queue
|
||||
, 2 /* reference count */
|
||||
, size /* allocation size */
|
||||
, narg /* dependence count */
|
||||
);
|
||||
|
||||
new( f.m_task ) task_base();
|
||||
|
||||
f.m_task->m_queue = queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = alloc_size ;
|
||||
f.m_task->m_dep_count = narg ;
|
||||
f.m_task->m_task_type = task_base::Aggregate ;
|
||||
|
||||
// Assign dependences, reference counts were already incremented
|
||||
|
||||
task_base ** const dep = f.m_task->aggregate_dependences();
|
||||
task_base * volatile * const dep =
|
||||
f.m_task->aggregate_dependences();
|
||||
|
||||
for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
queue->schedule_aggregate( f.m_task );
|
||||
// this when_all may be processed at any moment
|
||||
}
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_UNIQUE_TOKEN_HPP
|
||||
#define KOKKOS_UNIQUE_TOKEN_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
enum class UniqueTokenScope : int // second template parameter of UniqueToken
|
||||
{
|
||||
Instance, // default scope of the UniqueToken template
|
||||
Global // NOTE(review): presumably tokens unique across all instances - confirm with the back-end specializations
|
||||
};
|
||||
|
||||
/// \brief class to generate unique ids based on the required amount of concurrency
|
||||
///
|
||||
/// This object should behave like a ref-counted object, so that when the last
|
||||
/// instance is destroyed, resources are freed if needed
|
||||
template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance >
|
||||
class UniqueToken
|
||||
{
|
||||
public:
|
||||
using execution_space = ExecutionSpace;
|
||||
using size_type = typename execution_space::size_type;
|
||||
|
||||
/// \brief create object sized for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() );
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_type size() const ;
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_type acquire() const ;
|
||||
|
||||
/// \brief release a value previously returned by acquire()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void release( size_type ) const ;
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif //KOKKOS_UNIQUE_TOKEN_HPP
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -54,11 +54,14 @@
|
|||
#include <Kokkos_MemoryTraits.hpp>
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template< class DataType >
|
||||
|
@ -73,16 +76,6 @@ struct ViewDataAnalysis ;
|
|||
template< class , class ... >
|
||||
class ViewMapping { public: enum { is_assignable = false }; };
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
using Kokkos::Experimental::Impl::ViewMapping ;
|
||||
using Kokkos::Experimental::Impl::ViewDataAnalysis ;
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
|
@ -1563,12 +1556,12 @@ namespace Kokkos {
|
|||
namespace Impl {
|
||||
|
||||
inline
|
||||
void shared_allocation_tracking_claim_and_disable()
|
||||
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); }
|
||||
void shared_allocation_tracking_disable()
|
||||
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); }
|
||||
|
||||
inline
|
||||
void shared_allocation_tracking_release_and_enable()
|
||||
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); }
|
||||
void shared_allocation_tracking_enable()
|
||||
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); }
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
@ -1795,6 +1788,20 @@ void deep_copy
|
|||
|
||||
if ( (void *) dst.data() != (void*) src.data() ) {
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
|
||||
Kokkos::Profiling::beginDeepCopy(
|
||||
Kokkos::Profiling::SpaceHandle(dst_memory_space::name()),
|
||||
dst.label(),
|
||||
dst.data(),
|
||||
Kokkos::Profiling::SpaceHandle(src_memory_space::name()),
|
||||
src.label(),
|
||||
src.data(),
|
||||
nbytes);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Concern: If overlapping views then a parallel copy will be erroneous.
|
||||
// ...
|
||||
|
||||
|
@ -1882,7 +1889,14 @@ void deep_copy
|
|||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if (Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endDeepCopy();
|
||||
}
|
||||
#endif
|
||||
|
||||
} // ( (void *) dst.data() != (void*) src.data() )
|
||||
}
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
@ -2249,6 +2263,82 @@ resize( Kokkos::View<T,P...> & v ,
|
|||
|
||||
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
|
||||
|
||||
// Fix #904 by checking dimensions before actually resizing.
|
||||
//
|
||||
// Rank is known at compile time, so hopefully the compiler will
|
||||
// remove branches that are compile-time false. The upcoming "if
|
||||
// constexpr" language feature would make this certain.
|
||||
if (view_type::Rank == 1 &&
|
||||
n0 == static_cast<size_t> (v.extent(0))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 2 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 3 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1)) &&
|
||||
n2 == static_cast<size_t> (v.extent(2))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 4 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1)) &&
|
||||
n2 == static_cast<size_t> (v.extent(2)) &&
|
||||
n3 == static_cast<size_t> (v.extent(3))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 5 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1)) &&
|
||||
n2 == static_cast<size_t> (v.extent(2)) &&
|
||||
n3 == static_cast<size_t> (v.extent(3)) &&
|
||||
n4 == static_cast<size_t> (v.extent(4))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 6 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1)) &&
|
||||
n2 == static_cast<size_t> (v.extent(2)) &&
|
||||
n3 == static_cast<size_t> (v.extent(3)) &&
|
||||
n4 == static_cast<size_t> (v.extent(4)) &&
|
||||
n5 == static_cast<size_t> (v.extent(5))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 7 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1)) &&
|
||||
n2 == static_cast<size_t> (v.extent(2)) &&
|
||||
n3 == static_cast<size_t> (v.extent(3)) &&
|
||||
n4 == static_cast<size_t> (v.extent(4)) &&
|
||||
n5 == static_cast<size_t> (v.extent(5)) &&
|
||||
n6 == static_cast<size_t> (v.extent(6))) {
|
||||
return;
|
||||
}
|
||||
if (view_type::Rank == 8 &&
|
||||
n0 == static_cast<size_t> (v.extent(0)) &&
|
||||
n1 == static_cast<size_t> (v.extent(1)) &&
|
||||
n2 == static_cast<size_t> (v.extent(2)) &&
|
||||
n3 == static_cast<size_t> (v.extent(3)) &&
|
||||
n4 == static_cast<size_t> (v.extent(4)) &&
|
||||
n5 == static_cast<size_t> (v.extent(5)) &&
|
||||
n6 == static_cast<size_t> (v.extent(6)) &&
|
||||
n7 == static_cast<size_t> (v.extent(7))) {
|
||||
return;
|
||||
}
|
||||
// If Kokkos ever supports Views of rank > 8, the above code won't
|
||||
// be incorrect, because avoiding reallocation in resize() is just
|
||||
// an optimization.
|
||||
|
||||
// TODO (mfh 27 Jun 2017) If the old View has enough space but just
|
||||
// different dimensions (e.g., if the product of the dimensions,
|
||||
// including extra space for alignment, will not change), then
|
||||
// consider just reusing storage. For now, Kokkos always
|
||||
// reallocates if any of the dimensions change, even if the old View
|
||||
// has enough space.
|
||||
|
||||
view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
|
||||
|
||||
Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v );
|
||||
|
@ -2317,6 +2407,106 @@ void realloc( Kokkos::View<T,P...> & v ,
|
|||
}
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
template < class Specialize, typename A, typename B > // primary template: declared only, no definition here
|
||||
struct CommonViewValueType;
|
||||
|
||||
template < typename A, typename B > // specialization for Specialize == void
|
||||
struct CommonViewValueType< void, A, B >
|
||||
{
|
||||
using value_type = typename std::common_type< A , B >::type; // common type of the two value types
|
||||
};
|
||||
|
||||
|
||||
template < class Specialize, class ValueType > // primary template: declared only, no definition here
|
||||
struct CommonViewAllocProp;
|
||||
|
||||
template < class ValueType > // specialization for Specialize == void
|
||||
struct CommonViewAllocProp< void, ValueType >
|
||||
{
|
||||
using value_type = ValueType;
|
||||
|
||||
template < class ... Views >
|
||||
CommonViewAllocProp( const Views & ... ) {} // accepts any views and ignores them: this specialization stores no state
|
||||
};
|
||||
|
||||
|
||||
template < class ... Views > // primary template: declared only; the specializations below do the work
|
||||
struct DeduceCommonViewAllocProp;
|
||||
|
||||
// Base case must provide types for:
|
||||
// 1. specialize 2. value_type 3. is_view 4. prop_type
|
||||
template < class FirstView > // base case: exactly one view type
|
||||
struct DeduceCommonViewAllocProp< FirstView >
|
||||
{
|
||||
using specialize = typename FirstView::traits::specialize;
|
||||
|
||||
using value_type = typename FirstView::traits::value_type;
|
||||
|
||||
enum : bool { is_view = is_view< FirstView >::value };
|
||||
|
||||
using prop_type = CommonViewAllocProp< specialize, value_type >;
|
||||
};
|
||||
|
||||
|
||||
template < class FirstView, class ... NextViews > // recursive case: combine FirstView with the result for NextViews...
|
||||
struct DeduceCommonViewAllocProp< FirstView, NextViews... >
|
||||
{
|
||||
using NextTraits = DeduceCommonViewAllocProp< NextViews... >;
|
||||
|
||||
using first_specialize = typename FirstView::traits::specialize;
|
||||
using first_value_type = typename FirstView::traits::value_type;
|
||||
|
||||
enum : bool { first_is_view = is_view< FirstView >::value };
|
||||
|
||||
using next_specialize = typename NextTraits::specialize;
|
||||
using next_value_type = typename NextTraits::value_type;
|
||||
|
||||
enum : bool { next_is_view = NextTraits::is_view };
|
||||
|
||||
// common types
|
||||
|
||||
// determine specialize type
|
||||
// if first and next specialize differ and neither of them is void, error out
|
||||
static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value) , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" );
|
||||
|
||||
// otherwise choose non-void specialize if either/both are non-void
|
||||
using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value
|
||||
, first_specialize
|
||||
, typename std::conditional< ( std::is_same< first_specialize, void >::value
|
||||
&& !std::is_same< next_specialize, void >::value)
|
||||
, next_specialize
|
||||
, first_specialize
|
||||
>::type
|
||||
>::type;
|
||||
|
||||
using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type;
|
||||
|
||||
enum : bool { is_view = (first_is_view && next_is_view) }; // true only if every type in the pack is a view
|
||||
|
||||
using prop_type = CommonViewAllocProp< specialize, value_type >;
|
||||
};
|
||||
|
||||
} // end namespace Impl
|
||||
|
||||
template < class ... Views > // shorthand for the prop_type deduced from a pack of view types
|
||||
using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ;
|
||||
|
||||
// User function: builds the deduced common allocation-property object from the given views
|
||||
template < class ... Views >
|
||||
DeducedCommonPropsType<Views...>
|
||||
common_view_alloc_prop( Views const & ... views )
|
||||
{
|
||||
return DeducedCommonPropsType<Views...>( views... ); // forwards all views to the property type's constructor
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
// For backward compatibility:
|
||||
|
@ -2350,6 +2540,9 @@ using Kokkos::Impl::WithoutInitializing_t ;
|
|||
using Kokkos::Impl::AllowPadding_t ;
|
||||
using Kokkos::Impl::SharedAllocationRecord ;
|
||||
using Kokkos::Impl::SharedAllocationTracker ;
|
||||
using Kokkos::Impl::ViewMapping ;
|
||||
using Kokkos::Impl::ViewDataAnalysis ;
|
||||
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
|
|
|
@ -0,0 +1,265 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_WORKGRAPHPOLICY_HPP
|
||||
#define KOKKOS_WORKGRAPHPOLICY_HPP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace Experimental {
|
||||
|
||||
// Forward declaration of the executor helper class template. It is defined
// further down in this header and is named as a friend by WorkGraphPolicy so
// that it can reach the policy's private pop_work()/push_work() and members.
template< class functor_type , class execution_space, class ... policy_args >
|
||||
class WorkGraphExec;
|
||||
|
||||
}}} // namespace Kokkos::Impl::Experimental
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
// WorkGraphPolicy: execution policy whose work items are the row indices of a
// CRS graph (graph_type). Ready work indices are kept in an int32 queue
// (m_queue); the pair stored in m_ranges(0) holds the queue cursors:
//   .first  = next slot to pop  (advanced by pop_work)
//   .second = next slot to fill (advanced by push_work)
// setup() fills m_counts through get_crs_transpose_counts() and enqueues every
// index whose count is zero. Executors reach pop_work()/push_work() through
// the friend class template WorkGraphExec.
// NOTE(review): m_counts presumably holds, per row, the number of predecessors
// that have not finished yet -- confirm against get_crs_transpose_counts().
template< class ... Properties >
|
||||
class WorkGraphPolicy
|
||||
{
|
||||
public:
|
||||
|
||||
using self_type = WorkGraphPolicy<Properties ... >;
|
||||
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
|
||||
using index_type = typename traits::index_type;
|
||||
using execution_space = typename traits::execution_space;
|
||||
using work_tag = typename traits::work_tag;
|
||||
using memory_space = typename execution_space::memory_space;
|
||||
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
|
||||
using member_type = index_type;
|
||||
|
||||
private:
|
||||
|
||||
// The graph handed to the constructor.
graph_type m_graph;
|
||||
|
||||
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
|
||||
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
|
||||
using ranges_type = Kokkos::View<range_type*, memory_space>;
|
||||
// Row count of the graph, stored as int32 (initialized in the constructor).
const std::int32_t m_total_work;
|
||||
// Per-index counters; filled by get_crs_transpose_counts() in setup().
ints_type m_counts;
|
||||
// Queue of work indices; every slot is set to -1 in setup() until it is written.
ints_type m_queue;
|
||||
// One-element view; element 0 is the (pop cursor, push cursor) pair.
ranges_type m_ranges;
|
||||
|
||||
public:
|
||||
|
||||
// Tag + functor body: sets m_ranges[i] to the pair (0, 0).
struct TagZeroRanges {};
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagZeroRanges, std::int32_t i) const {
|
||||
m_ranges[i] = range_type(0, 0);
|
||||
}
|
||||
// Runs the TagZeroRanges functor (this object itself) over the index range
// [0, 1) and then fences the execution space. setup() below does not call it.
void zero_ranges() {
|
||||
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
|
||||
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
|
||||
const closure_type closure(*this, policy_type(0, 1));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
|
||||
// Tag + functor body: enqueue index i if its counter in m_counts is zero.
// The counter is read through a volatile pointer.
struct TagFillQueue {};
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagFillQueue, std::int32_t i) const {
|
||||
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
|
||||
}
|
||||
// Runs the TagFillQueue functor (this object itself) over [0, m_total_work)
// and then fences the execution space.
void fill_queue() {
|
||||
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
|
||||
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
|
||||
const closure_type closure(*this, policy_type(0, m_total_work));
|
||||
closure.execute();
|
||||
execution_space::fence();
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
// One-time initialization, called from the constructor:
//   1. abort if the graph's row count does not fit in int32_t,
//   2. fill m_counts via get_crs_transpose_counts(),
//   3. allocate m_queue with m_total_work slots and set every slot to -1
//      (the "not yet written" marker pop_work() spins on),
//   4. allocate the one-element m_ranges view,
//   5. enqueue all indices whose count is zero (fill_queue()).
inline
|
||||
void setup() {
|
||||
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
|
||||
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
|
||||
}
|
||||
get_crs_transpose_counts(m_counts, m_graph);
|
||||
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
|
||||
deep_copy(m_queue, std::int32_t(-1));
|
||||
m_ranges = ranges_type("ranges", 1);
|
||||
fill_queue();
|
||||
}
|
||||
|
||||
// Consumer end of the queue.
// Returns the next work index, or -1 once the pop cursor equals m_total_work
// and the queue is empty. Otherwise it keeps looping until work appears.
// Each iteration attempts a compare-exchange on m_ranges(0) that advances the
// pop cursor by one; `w` is then overwritten with the value the
// compare-exchange returned, and comparing it against `w_new` tells whether
// this caller's exchange was the one that took effect.
KOKKOS_INLINE_FUNCTION
|
||||
std::int32_t pop_work() const {
|
||||
range_type w(-1,-1);
|
||||
while (true) {
|
||||
const range_type w_new( w.first + 1 , w.second );
|
||||
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
|
||||
if ( w.first < w.second ) { // there was work in the queue
|
||||
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
|
||||
// we got a work item
|
||||
std::int32_t i;
|
||||
// the push_work function may have incremented the end counter
|
||||
// but not yet written the work index into the queue.
|
||||
// wait until the entry is valid.
|
||||
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
|
||||
return i;
|
||||
} // we got a work item
|
||||
} else { // there was no work in the queue
|
||||
#ifdef KOKKOS_DEBUG
|
||||
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
|
||||
Kokkos::abort("bug in pop_work");
|
||||
}
|
||||
#endif
|
||||
if (w.first == m_total_work) { // all work is done
|
||||
return -1;
|
||||
} else { // need to wait for more work to be pushed
|
||||
// take a guess that one work item will be pushed
|
||||
// the key thing is we can't leave (w) alone, because
|
||||
// otherwise the next compare_exchange may succeed in
|
||||
// popping work from an empty queue
|
||||
w.second++;
|
||||
}
|
||||
} // there was no work in the queue
|
||||
} // while (true)
|
||||
}
|
||||
|
||||
// Producer end of the queue.
// Claims a slot by advancing the push cursor of m_ranges(0) with a
// compare-exchange retry loop (on exit, w.second is the claimed slot), then
// stores work index i into that slot through a volatile pointer and issues a
// memory fence.
KOKKOS_INLINE_FUNCTION
|
||||
void push_work(std::int32_t i) const {
|
||||
range_type w(-1,-1);
|
||||
while (true) {
|
||||
const range_type w_new( w.first , w.second + 1 );
|
||||
// try to increment the end counter
|
||||
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
|
||||
// stop trying if the increment was successful
|
||||
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
|
||||
}
|
||||
// write the work index into the claimed spot in the queue
|
||||
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
|
||||
// push this write out into the memory system
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
// Grants every WorkGraphExec specialization access to the private members
// (pop_work, push_work, m_graph, m_counts).
template< class functor_type , class execution_space, class ... policy_args >
|
||||
friend class Kokkos::Impl::Experimental::WorkGraphExec;
|
||||
|
||||
public:
|
||||
|
||||
// Stores a copy of the graph, records its row count in m_total_work, and
// runs setup().
WorkGraphPolicy(graph_type arg_graph)
|
||||
: m_graph(arg_graph)
|
||||
, m_total_work( arg_graph.numRows() )
|
||||
{
|
||||
setup();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace Experimental {
|
||||
|
||||
// WorkGraphExec: helper that pairs a functor with a WorkGraphPolicy and exposes
// the two queue hooks as protected members:
//   before_work() -- obtain the next work index from the policy's queue
//   after_work(i) -- release the indices listed in row i of the graph
// All members other than the type aliases are protected.
// NOTE(review): presumably the backend-specific headers included at the end of
// this file derive from this class -- confirm there.
template< class functor_type , class execution_space, class ... policy_args >
|
||||
class WorkGraphExec
|
||||
{
|
||||
public:
|
||||
|
||||
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
|
||||
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
|
||||
using member_type = typename policy_type::member_type;
|
||||
using memory_space = typename execution_space::memory_space;
|
||||
|
||||
protected:
|
||||
|
||||
// Copies of the functor and the policy passed to the constructor.
const functor_type m_functor;
|
||||
const policy_type m_policy;
|
||||
|
||||
protected:
|
||||
|
||||
// Returns whatever the policy's pop_work() returns: a work index, or -1 when
// pop_work() reports that all work is done.
KOKKOS_INLINE_FUNCTION
|
||||
std::int32_t before_work() const {
|
||||
return m_policy.pop_work();
|
||||
}
|
||||
|
||||
// Called with a finished work index i. After a memory fence, walks the
// entries of graph row i (row_map(i) .. row_map(i+1)), atomically decrements
// the counter of each listed index, and pushes an index onto the queue when
// its counter was 1 before the decrement (i.e. it has just reached zero).
KOKKOS_INLINE_FUNCTION
|
||||
void after_work(std::int32_t i) const {
|
||||
/* fence any writes that were done by the work item itself
|
||||
(usually writing its result to global memory) */
|
||||
memory_fence();
|
||||
const std::int32_t begin = m_policy.m_graph.row_map( i );
|
||||
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
|
||||
for (std::int32_t j = begin; j < end; ++j) {
|
||||
const std::int32_t next = m_policy.m_graph.entries( j );
|
||||
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
|
||||
if ( old_count == 1 ) m_policy.push_work( next );
|
||||
}
|
||||
}
|
||||
|
||||
// Stores copies of the functor and the policy; does nothing else.
inline
|
||||
WorkGraphExec( const functor_type & arg_functor
|
||||
, const policy_type & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
}}} // namespace Kokkos::Impl::Experimental
|
||||
|
||||
#ifdef KOKKOS_ENABLE_SERIAL
|
||||
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMP
|
||||
#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_THREADS
|
||||
#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp"
|
||||
#endif
|
||||
|
||||
#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */
|
|
@ -45,75 +45,100 @@
|
|||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <iostream>
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int kokkos_omp_in_parallel();
|
||||
int g_openmp_hardware_max_threads = 1;
|
||||
|
||||
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
|
||||
__thread int t_openmp_hardware_id = 0;
|
||||
__thread Impl::OpenMPExec * t_openmp_instance = nullptr;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int kokkos_omp_in_parallel()
|
||||
void OpenMPExec::validate_partition( const int nthreads
|
||||
, int & num_partitions
|
||||
, int & partition_size
|
||||
)
|
||||
{
|
||||
#ifndef __CUDA_ARCH__
|
||||
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
if (nthreads == 1) {
|
||||
num_partitions = 1;
|
||||
partition_size = 1;
|
||||
}
|
||||
else if( num_partitions < 1 && partition_size < 1) {
|
||||
int idle = nthreads;
|
||||
for (int np = 2; np <= nthreads ; ++np) {
|
||||
for (int ps = 1; ps <= nthreads/np; ++ps) {
|
||||
if (nthreads - np*ps < idle) {
|
||||
idle = nthreads - np*ps;
|
||||
num_partitions = np;
|
||||
partition_size = ps;
|
||||
}
|
||||
if (idle == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( num_partitions < 1 && partition_size > 0 ) {
|
||||
if ( partition_size <= nthreads ) {
|
||||
num_partitions = nthreads / partition_size;
|
||||
}
|
||||
else {
|
||||
num_partitions = 1;
|
||||
partition_size = nthreads;
|
||||
}
|
||||
}
|
||||
else if( num_partitions > 0 && partition_size < 1 ) {
|
||||
if ( num_partitions <= nthreads ) {
|
||||
partition_size = nthreads / num_partitions;
|
||||
}
|
||||
else {
|
||||
num_partitions = nthreads;
|
||||
partition_size = 1;
|
||||
}
|
||||
}
|
||||
else if ( num_partitions * partition_size > nthreads ) {
|
||||
int idle = nthreads;
|
||||
const int NP = num_partitions;
|
||||
const int PS = partition_size;
|
||||
for (int np = NP; np > 0; --np) {
|
||||
for (int ps = PS; ps > 0; --ps) {
|
||||
if ( (np*ps <= nthreads)
|
||||
&& (nthreads - np*ps < idle) ) {
|
||||
idle = nthreads - np*ps;
|
||||
num_partitions = np;
|
||||
partition_size = ps;
|
||||
}
|
||||
if (idle == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool s_using_hwloc = false;
|
||||
|
||||
} // namespace
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
int OpenMPExec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
void OpenMPExec::verify_is_process( const char * const label )
|
||||
void OpenMPExec::verify_is_master( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
if ( !t_openmp_instance )
|
||||
{
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: in parallel" );
|
||||
msg.append( " ERROR: in parallel or not initialized" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
}
|
||||
|
||||
void OpenMPExec::verify_initialized( const char * const label )
|
||||
{
|
||||
if ( 0 == m_pool[0] ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: not initialized" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: Initialized but threads modified inappropriately" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
@ -133,11 +158,11 @@ void OpenMPExec::clear_thread_data()
|
|||
const int old_alloc_bytes =
|
||||
m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
|
||||
|
||||
Kokkos::HostSpace space ;
|
||||
OpenMP::memory_space space ;
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp parallel num_threads( m_pool_size )
|
||||
{
|
||||
const int rank = m_map_rank[ omp_get_thread_num() ];
|
||||
const int rank = omp_get_thread_num();
|
||||
|
||||
if ( 0 != m_pool[rank] ) {
|
||||
|
||||
|
@ -189,13 +214,13 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
|
|||
, team_shared_bytes
|
||||
, thread_local_bytes );
|
||||
|
||||
const int pool_size = omp_get_max_threads();
|
||||
OpenMP::memory_space space ;
|
||||
|
||||
Kokkos::HostSpace space ;
|
||||
memory_fence();
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp parallel num_threads(m_pool_size)
|
||||
{
|
||||
const int rank = m_map_rank[ omp_get_thread_num() ];
|
||||
const int rank = omp_get_thread_num();
|
||||
|
||||
if ( 0 != m_pool[rank] ) {
|
||||
|
||||
|
@ -214,11 +239,14 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
|
|||
, pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes );
|
||||
, thread_local_bytes
|
||||
);
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
HostThreadTeamData::organize_pool( m_pool , pool_size );
|
||||
HostThreadTeamData::organize_pool( m_pool , m_pool_size );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -232,16 +260,8 @@ namespace Kokkos {
|
|||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
int OpenMP::is_initialized()
|
||||
{ return 0 != Impl::OpenMPExec::m_pool[0]; }
|
||||
|
||||
void OpenMP::initialize( unsigned thread_count ,
|
||||
unsigned use_numa_count ,
|
||||
unsigned use_cores_per_numa )
|
||||
int OpenMP::get_current_max_threads() noexcept
|
||||
{
|
||||
// Before any other call to OMP query the maximum number of threads
|
||||
// and save the value for re-initialization unit testing.
|
||||
|
||||
// Using omp_get_max_threads(); is problematic in conjunction with
|
||||
// Hwloc on Intel (essentially an initial call to the OpenMP runtime
|
||||
// without a parallel region before will set a process mask for a single core
|
||||
|
@ -250,110 +270,99 @@ void OpenMP::initialize( unsigned thread_count ,
|
|||
// the thread masks. The intent seems to be to make serial code run fast, if you
|
||||
// compile with OpenMP enabled but don't actually use parallel regions or so
|
||||
// static int omp_max_threads = omp_get_max_threads();
|
||||
int nthreads = 0;
|
||||
|
||||
int count = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp atomic
|
||||
nthreads++;
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
static int omp_max_threads = nthreads;
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
|
||||
|
||||
bool thread_spawn_failed = false ;
|
||||
|
||||
if ( ! is_initialized ) {
|
||||
|
||||
// Use hwloc thread pinning if concerned with locality.
|
||||
// If spreading threads across multiple NUMA regions.
|
||||
// If hyperthreading is enabled.
|
||||
Impl::s_using_hwloc = hwloc::available() && (
|
||||
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
|
||||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
|
||||
|
||||
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];
|
||||
|
||||
// If hwloc available then use it's maximum value.
|
||||
|
||||
if ( thread_count == 0 ) {
|
||||
thread_count = Impl::s_using_hwloc
|
||||
? Kokkos::hwloc::get_available_numa_count() *
|
||||
Kokkos::hwloc::get_available_cores_per_numa() *
|
||||
Kokkos::hwloc::get_available_threads_per_core()
|
||||
: omp_max_threads ;
|
||||
}
|
||||
|
||||
if(Impl::s_using_hwloc)
|
||||
hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
|
||||
false /* do not allow asynchronous */ ,
|
||||
thread_count ,
|
||||
use_numa_count ,
|
||||
use_cores_per_numa ,
|
||||
threads_coord );
|
||||
|
||||
// Spawn threads:
|
||||
|
||||
omp_set_num_threads( thread_count );
|
||||
|
||||
// Verify OMP interaction:
|
||||
if ( int(thread_count) != omp_get_max_threads() ) {
|
||||
thread_spawn_failed = true ;
|
||||
}
|
||||
|
||||
// Verify spawning and bind threads:
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp critical
|
||||
{
|
||||
if ( int(thread_count) != omp_get_num_threads() ) {
|
||||
thread_spawn_failed = true ;
|
||||
}
|
||||
|
||||
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
|
||||
// Call to 'new' may not be thread safe as well.
|
||||
|
||||
const unsigned omp_rank = omp_get_thread_num();
|
||||
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
|
||||
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
|
||||
: omp_rank ;
|
||||
|
||||
Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
|
||||
}
|
||||
/* END #pragma omp critical */
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
if ( ! thread_spawn_failed ) {
|
||||
Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
|
||||
Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
|
||||
Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
|
||||
|
||||
// New, unified host thread team data:
|
||||
{
|
||||
size_t pool_reduce_bytes = 32 * thread_count ;
|
||||
size_t team_reduce_bytes = 32 * thread_count ;
|
||||
size_t team_shared_bytes = 1024 * thread_count ;
|
||||
size_t thread_local_bytes = 1024 ;
|
||||
|
||||
Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( is_initialized || thread_spawn_failed ) {
|
||||
std::string msg("Kokkos::OpenMP::initialize ERROR");
|
||||
|
||||
if ( is_initialized ) { msg.append(" : already initialized"); }
|
||||
if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
|
||||
|
||||
void OpenMP::initialize( int thread_count )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
|
||||
Kokkos::Impl::throw_runtime_exception(msg);
|
||||
}
|
||||
|
||||
if ( Impl::t_openmp_instance )
|
||||
{
|
||||
finalize();
|
||||
}
|
||||
|
||||
{
|
||||
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
|
||||
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
|
||||
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
|
||||
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
|
||||
printf(" For unit testing set OMP_PROC_BIND=false\n");
|
||||
}
|
||||
|
||||
OpenMP::memory_space space ;
|
||||
|
||||
// Before any other call to OMP query the maximum number of threads
|
||||
// and save the value for re-initialization unit testing.
|
||||
|
||||
Impl::g_openmp_hardware_max_threads = get_current_max_threads();
|
||||
|
||||
int process_num_threads = Impl::g_openmp_hardware_max_threads;
|
||||
|
||||
if ( Kokkos::hwloc::available() ) {
|
||||
process_num_threads = Kokkos::hwloc::get_available_numa_count()
|
||||
* Kokkos::hwloc::get_available_cores_per_numa()
|
||||
* Kokkos::hwloc::get_available_threads_per_core();
|
||||
}
|
||||
|
||||
// if thread_count < 0, use g_openmp_hardware_max_threads;
|
||||
// if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads
|
||||
// if thread_count > 0, set g_openmp_hardware_max_threads to thread_count
|
||||
if (thread_count < 0 ) {
|
||||
thread_count = Impl::g_openmp_hardware_max_threads;
|
||||
}
|
||||
else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) {
|
||||
Impl::g_openmp_hardware_max_threads = process_num_threads;
|
||||
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
|
||||
}
|
||||
else {
|
||||
if( thread_count > process_num_threads ) {
|
||||
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
|
||||
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
|
||||
}
|
||||
Impl::g_openmp_hardware_max_threads = thread_count;
|
||||
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
|
||||
}
|
||||
|
||||
// setup thread local
|
||||
#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
|
||||
{
|
||||
Impl::t_openmp_instance = nullptr;
|
||||
Impl::t_openmp_hardware_id = omp_get_thread_num();
|
||||
Impl::SharedAllocationRecord< void, void >::tracking_enable();
|
||||
}
|
||||
|
||||
void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) );
|
||||
|
||||
Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads );
|
||||
|
||||
// New, unified host thread team data:
|
||||
{
|
||||
size_t pool_reduce_bytes = 32 * thread_count ;
|
||||
size_t team_reduce_bytes = 32 * thread_count ;
|
||||
size_t team_shared_bytes = 1024 * thread_count ;
|
||||
size_t thread_local_bytes = 1024 ;
|
||||
|
||||
Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Check for over-subscription
|
||||
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
|
||||
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
|
||||
|
@ -373,20 +382,38 @@ void OpenMP::initialize( unsigned thread_count ,
|
|||
|
||||
void OpenMP::finalize()
|
||||
{
|
||||
Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
|
||||
Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );
|
||||
if ( omp_in_parallel() )
|
||||
{
|
||||
std::string msg("Kokkos::OpenMP::finalize ERROR ");
|
||||
if( !Impl::t_openmp_instance ) msg.append(": not initialized");
|
||||
if( omp_in_parallel() ) msg.append(": in parallel");
|
||||
Kokkos::Impl::throw_runtime_exception(msg);
|
||||
}
|
||||
|
||||
// New, unified host thread team data:
|
||||
Impl::OpenMPExec::clear_thread_data();
|
||||
if ( Impl::t_openmp_instance ) {
|
||||
|
||||
Impl::OpenMPExec::m_pool_topo[0] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[1] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[2] = 0 ;
|
||||
const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
|
||||
? Impl::g_openmp_hardware_max_threads
|
||||
: Impl::t_openmp_instance->m_pool_size;
|
||||
|
||||
omp_set_num_threads(1);
|
||||
using Exec = Impl::OpenMPExec;
|
||||
Exec * instance = Impl::t_openmp_instance;
|
||||
instance->~Exec();
|
||||
|
||||
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
|
||||
hwloc::unbind_this_thread();
|
||||
OpenMP::memory_space space;
|
||||
space.deallocate( instance, sizeof(Exec) );
|
||||
|
||||
#pragma omp parallel num_threads(nthreads)
|
||||
{
|
||||
Impl::t_openmp_hardware_id = 0;
|
||||
Impl::t_openmp_instance = nullptr;
|
||||
Impl::SharedAllocationRecord< void, void >::tracking_disable();
|
||||
}
|
||||
|
||||
// allow main thread to track
|
||||
Impl::SharedAllocationRecord< void, void >::tracking_enable();
|
||||
|
||||
Impl::g_openmp_hardware_max_threads = 1;
|
||||
}
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
|
@ -396,70 +423,48 @@ void OpenMP::finalize()
|
|||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
void OpenMP::print_configuration( std::ostream & s , const bool verbose )
|
||||
{
|
||||
Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );
|
||||
|
||||
s << "Kokkos::OpenMP" ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
s << " KOKKOS_ENABLE_OPENMP" ;
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_HWLOC )
|
||||
|
||||
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
|
||||
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
|
||||
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
|
||||
|
||||
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
|
||||
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
|
||||
;
|
||||
#endif
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
|
||||
const bool is_initialized = Impl::t_openmp_instance != nullptr;
|
||||
|
||||
if ( is_initialized ) {
|
||||
const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
|
||||
Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" );
|
||||
|
||||
const int numa_count = 1;
|
||||
const int core_per_numa = Impl::g_openmp_hardware_max_threads;
|
||||
const int thread_per_core = 1;
|
||||
|
||||
s << " thread_pool_topology[ " << numa_count
|
||||
<< " x " << core_per_numa
|
||||
<< " x " << thread_per_core
|
||||
<< " ]"
|
||||
<< std::endl ;
|
||||
|
||||
if ( detail ) {
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp critical
|
||||
{
|
||||
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
|
||||
}
|
||||
/* END #pragma omp critical */
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
|
||||
s << " thread omp_rank[" << i << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
|
||||
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
s << " not initialized" << std::endl ;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<OpenMP> OpenMP::partition(...)
|
||||
{ return std::vector<OpenMP>(1); }
|
||||
|
||||
OpenMP OpenMP::create_instance(...) { return OpenMP(); }
|
||||
|
||||
|
||||
#if !defined( KOKKOS_DISABLE_DEPRECATED )
|
||||
|
||||
int OpenMP::concurrency() {
|
||||
return thread_pool_size(0);
|
||||
return Impl::g_openmp_hardware_max_threads;
|
||||
}
|
||||
|
||||
const char* OpenMP::name() { return "OpenMP"; }
|
||||
void OpenMP::initialize( int thread_count , int, int )
|
||||
{
|
||||
initialize(thread_count);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
|
|
@ -47,6 +47,10 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
|
||||
#endif
|
||||
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
|
@ -54,6 +58,8 @@
|
|||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
#include <Kokkos_UniqueToken.hpp>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
@ -63,8 +69,14 @@
|
|||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
class OpenMPExec;
|
||||
|
||||
extern int g_openmp_hardware_max_threads;
|
||||
|
||||
extern __thread int t_openmp_hardware_id;
|
||||
extern __thread OpenMPExec * t_openmp_instance;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Data for OpenMP thread execution */
|
||||
|
@ -74,279 +86,279 @@ public:
|
|||
|
||||
friend class Kokkos::OpenMP ;
|
||||
|
||||
enum { MAX_THREAD_COUNT = 4096 };
|
||||
enum { MAX_THREAD_COUNT = 512 };
|
||||
|
||||
void clear_thread_data();
|
||||
|
||||
static void validate_partition( const int nthreads
|
||||
, int & num_partitions
|
||||
, int & partition_size
|
||||
);
|
||||
|
||||
private:
|
||||
OpenMPExec( int arg_pool_size )
|
||||
: m_pool_size{ arg_pool_size }
|
||||
, m_level{ omp_get_level() }
|
||||
, m_pool()
|
||||
{}
|
||||
|
||||
static int m_pool_topo[ 4 ];
|
||||
static int m_map_rank[ MAX_THREAD_COUNT ];
|
||||
~OpenMPExec()
|
||||
{
|
||||
clear_thread_data();
|
||||
}
|
||||
|
||||
static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
|
||||
int m_pool_size;
|
||||
int m_level;
|
||||
|
||||
static
|
||||
void clear_thread_data();
|
||||
HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
|
||||
|
||||
public:
|
||||
|
||||
// Topology of a cache coherent thread pool:
|
||||
// TOTAL = NUMA x GRAIN
|
||||
// pool_size( depth = 0 )
|
||||
// pool_size(0) = total number of threads
|
||||
// pool_size(1) = number of threads per NUMA
|
||||
// pool_size(2) = number of threads sharing finest grain memory hierarchy
|
||||
static void verify_is_master( const char * const );
|
||||
|
||||
inline static
|
||||
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
|
||||
|
||||
static void finalize();
|
||||
|
||||
static void initialize( const unsigned team_count ,
|
||||
const unsigned threads_per_team ,
|
||||
const unsigned numa_count ,
|
||||
const unsigned cores_per_numa );
|
||||
|
||||
static void verify_is_process( const char * const );
|
||||
static void verify_initialized( const char * const );
|
||||
|
||||
|
||||
static
|
||||
void resize_thread_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes );
|
||||
|
||||
inline static
|
||||
HostThreadTeamData * get_thread_data() noexcept
|
||||
{ return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
|
||||
inline
|
||||
HostThreadTeamData * get_thread_data() const noexcept
|
||||
{ return m_pool[ m_level == omp_get_level() ? 0 : omp_get_thread_num() ]; }
|
||||
|
||||
inline static
|
||||
HostThreadTeamData * get_thread_data( int i ) noexcept
|
||||
{ return m_pool[i]; }
|
||||
inline
|
||||
HostThreadTeamData * get_thread_data( int i ) const noexcept
|
||||
{ return m_pool[i]; }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
|
||||
{
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos execution policy
|
||||
typedef TeamPolicyInternal execution_policy ;
|
||||
|
||||
typedef PolicyTraits<Properties ... > traits;
|
||||
|
||||
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
|
||||
m_league_size = p.m_league_size;
|
||||
m_team_size = p.m_team_size;
|
||||
m_team_alloc = p.m_team_alloc;
|
||||
m_team_iter = p.m_team_iter;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & ) {
|
||||
int pool_size = traits::execution_space::thread_pool_size(1);
|
||||
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
return pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType & )
|
||||
{ return traits::execution_space::thread_pool_size(2); }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType &, const int& )
|
||||
{ return traits::execution_space::thread_pool_size(2); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
private:
|
||||
|
||||
int m_league_size ;
|
||||
int m_team_size ;
|
||||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_team_scratch_size[2];
|
||||
size_t m_thread_scratch_size[2];
|
||||
|
||||
int m_chunk_size;
|
||||
|
||||
inline void init( const int league_size_request
|
||||
, const int team_size_request )
|
||||
{
|
||||
const int pool_size = traits::execution_space::thread_pool_size(0);
|
||||
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
const int team_grain = traits::execution_space::thread_pool_size(2);
|
||||
|
||||
m_league_size = league_size_request ;
|
||||
|
||||
m_team_size = team_size_request < team_max ?
|
||||
team_size_request : team_max ;
|
||||
|
||||
// Round team size up to a multiple of 'team_gain'
|
||||
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
|
||||
const int team_count = pool_size / team_size_grain ;
|
||||
|
||||
// Constraint : pool_size = m_team_alloc * team_count
|
||||
m_team_alloc = pool_size / team_count ;
|
||||
|
||||
// Maxumum number of iterations each team will take:
|
||||
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
|
||||
|
||||
set_auto_chunk_size();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
|
||||
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
|
||||
if(team_size_ < 0) team_size_ = m_team_size;
|
||||
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
|
||||
}
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1)
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
|
||||
|
||||
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int team_iter() const { return m_team_iter ; }
|
||||
|
||||
inline int chunk_size() const { return m_chunk_size ; }
|
||||
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_chunk_size = chunk_size_;
|
||||
return p;
|
||||
}
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
private:
|
||||
/** \brief finalize chunk_size if it was set to AUTO*/
|
||||
inline void set_auto_chunk_size() {
|
||||
|
||||
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
|
||||
if( concurrency==0 ) concurrency=1;
|
||||
|
||||
if(m_chunk_size > 0) {
|
||||
if(!Impl::is_integral_power_of_two( m_chunk_size ))
|
||||
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
|
||||
}
|
||||
|
||||
int new_chunk_size = 1;
|
||||
while(new_chunk_size*100*concurrency < m_league_size)
|
||||
new_chunk_size *= 2;
|
||||
if(new_chunk_size < 128) {
|
||||
new_chunk_size = 1;
|
||||
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
|
||||
new_chunk_size*=2;
|
||||
}
|
||||
m_chunk_size = new_chunk_size;
|
||||
}
|
||||
|
||||
public:
|
||||
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
|
||||
};
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
inline
|
||||
bool OpenMP::in_parallel()
|
||||
{ return omp_in_parallel(); }
|
||||
// Default constructor; the body is empty, so construction performs no work.
inline OpenMP::OpenMP() noexcept
|
||||
{}
|
||||
|
||||
inline
|
||||
int OpenMP::thread_pool_size( int depth )
|
||||
bool OpenMP::is_initialized() noexcept
|
||||
{ return Impl::t_openmp_instance != nullptr; }
|
||||
|
||||
inline
|
||||
bool OpenMP::in_parallel( OpenMP const& ) noexcept
|
||||
{
|
||||
return Impl::OpenMPExec::pool_size(depth);
|
||||
//t_openmp_instance is only non-null on a master thread
|
||||
return !Impl::t_openmp_instance
|
||||
|| Impl::t_openmp_instance->m_level < omp_get_level()
|
||||
;
|
||||
}
|
||||
|
||||
inline
|
||||
int OpenMP::thread_pool_size() noexcept
|
||||
{
|
||||
return OpenMP::in_parallel()
|
||||
? omp_get_num_threads()
|
||||
: Impl::t_openmp_instance->m_pool_size
|
||||
;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int OpenMP::thread_pool_rank()
|
||||
int OpenMP::thread_pool_rank() noexcept
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
|
||||
return Impl::t_openmp_instance ? 0 : omp_get_thread_num();
|
||||
#else
|
||||
return -1 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline
|
||||
void OpenMP::fence( OpenMP const& instance ) noexcept {}
|
||||
|
||||
inline
|
||||
bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept
|
||||
{ return false; }
|
||||
|
||||
template <typename F>
|
||||
void OpenMP::partition_master( F const& f
|
||||
, int num_partitions
|
||||
, int partition_size
|
||||
)
|
||||
{
|
||||
if (omp_get_nested()) {
|
||||
using Exec = Impl::OpenMPExec;
|
||||
|
||||
Exec * prev_instance = Impl::t_openmp_instance;
|
||||
|
||||
Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size );
|
||||
|
||||
OpenMP::memory_space space;
|
||||
|
||||
#pragma omp parallel num_threads(num_partitions)
|
||||
{
|
||||
void * const ptr = space.allocate( sizeof(Exec) );
|
||||
|
||||
Impl::t_openmp_instance = new (ptr) Exec( partition_size );
|
||||
|
||||
size_t pool_reduce_bytes = 32 * partition_size ;
|
||||
size_t team_reduce_bytes = 32 * partition_size ;
|
||||
size_t team_shared_bytes = 1024 * partition_size ;
|
||||
size_t thread_local_bytes = 1024 ;
|
||||
|
||||
Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes
|
||||
);
|
||||
|
||||
f( omp_get_thread_num(), omp_get_num_threads() );
|
||||
|
||||
Impl::t_openmp_instance->~Exec();
|
||||
space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
|
||||
Impl::t_openmp_instance = nullptr;
|
||||
}
|
||||
|
||||
Impl::t_openmp_instance = prev_instance;
|
||||
}
|
||||
else {
|
||||
// nested openmp not enabled
|
||||
f(0,1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
template<>
|
||||
class MasterLock<OpenMP>
|
||||
{
|
||||
public:
|
||||
void lock() { omp_set_lock( &m_lock ); }
|
||||
void unlock() { omp_unset_lock( &m_lock ); }
|
||||
bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); }
|
||||
|
||||
MasterLock() { omp_init_lock( &m_lock ); }
|
||||
~MasterLock() { omp_destroy_lock( &m_lock ); }
|
||||
|
||||
MasterLock( MasterLock const& ) = delete;
|
||||
MasterLock( MasterLock && ) = delete;
|
||||
MasterLock & operator=( MasterLock const& ) = delete;
|
||||
MasterLock & operator=( MasterLock && ) = delete;
|
||||
|
||||
private:
|
||||
omp_lock_t m_lock;
|
||||
|
||||
};
|
||||
|
||||
template<>
|
||||
class UniqueToken< OpenMP, UniqueTokenScope::Instance>
|
||||
{
|
||||
public:
|
||||
using execution_space = OpenMP;
|
||||
using size_type = int;
|
||||
|
||||
/// \brief create object size for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() ) noexcept {}
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int size() const noexcept
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Kokkos::OpenMP::thread_pool_size();
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int acquire() const noexcept
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Kokkos::OpenMP::thread_pool_rank();
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief release a value acquired by generate
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void release( int ) const noexcept {}
|
||||
};
|
||||
|
||||
template<>
|
||||
class UniqueToken< OpenMP, UniqueTokenScope::Global>
|
||||
{
|
||||
public:
|
||||
using execution_space = OpenMP;
|
||||
using size_type = int;
|
||||
|
||||
/// \brief create object size for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() ) noexcept {}
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int size() const noexcept
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Kokkos::Impl::g_openmp_hardware_max_threads ;
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int acquire() const noexcept
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Kokkos::Impl::t_openmp_hardware_id ;
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// \brief release a value acquired by generate
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void release( int ) const noexcept {}
|
||||
};
|
||||
|
||||
} // namespace Experimental
|
||||
|
||||
|
||||
#if !defined( KOKKOS_DISABLE_DEPRECATED )
|
||||
|
||||
inline
|
||||
int OpenMP::thread_pool_size( int depth )
|
||||
{
|
||||
return depth < 2
|
||||
? thread_pool_size()
|
||||
: 1;
|
||||
}
|
||||
|
||||
// Deprecated: returns Impl::t_openmp_hardware_id in host code and -1 when the
// active execution memory space is not the host.
KOKKOS_INLINE_FUNCTION
int OpenMP::hardware_thread_id() noexcept
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
  return -1 ;
#else
  return Impl::t_openmp_hardware_id;
#endif
}
|
||||
|
||||
inline
|
||||
int OpenMP::max_hardware_threads() noexcept
|
||||
{
|
||||
return Impl::g_openmp_hardware_max_threads;
|
||||
}
|
||||
|
||||
#endif // KOKKOS_DISABLE_DEPRECATED
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
|
|
@ -52,6 +52,8 @@
|
|||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
@ -71,8 +73,9 @@ private:
|
|||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
OpenMPExec * m_instance ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
|
@ -110,16 +113,120 @@ private:
|
|||
public:
|
||||
|
||||
inline void execute() const
|
||||
{
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value
|
||||
};
|
||||
|
||||
if ( OpenMP::in_parallel() ) {
|
||||
exec_range< WorkTag >( m_functor
|
||||
, m_policy.begin()
|
||||
, m_policy.end() );
|
||||
}
|
||||
else {
|
||||
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
|
||||
if ( is_dynamic ) {
|
||||
// Make sure work partition is set before stealing
|
||||
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
std::pair<int64_t,int64_t> range(0,0);
|
||||
|
||||
do {
|
||||
|
||||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
ParallelFor::template
|
||||
exec_range< WorkTag >( m_functor
|
||||
, range.first + m_policy.begin()
|
||||
, range.second + m_policy.begin() );
|
||||
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store copies of the functor and the policy, and record the value of
// t_openmp_instance at construction time in m_instance.
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, Policy arg_policy )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, Kokkos::OpenMP
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
|
||||
typedef typename MDRangePolicy::impl_range_policy Policy ;
|
||||
typedef typename MDRangePolicy::work_tag WorkTag ;
|
||||
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
|
||||
|
||||
OpenMPExec * m_instance ;
|
||||
const FunctorType m_functor ;
|
||||
const MDRangePolicy m_mdr_policy ;
|
||||
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
|
||||
|
||||
inline static
|
||||
void
|
||||
exec_range( const MDRangePolicy & mdr_policy
|
||||
, const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
iterate_type( mdr_policy, functor )( iwork );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const
|
||||
{
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
if ( OpenMP::in_parallel() ) {
|
||||
ParallelFor::exec_range ( m_mdr_policy
|
||||
, m_functor
|
||||
, m_policy.begin()
|
||||
, m_policy.end() );
|
||||
}
|
||||
else {
|
||||
|
||||
#pragma omp parallel
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
|
@ -136,8 +243,8 @@ public:
|
|||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
ParallelFor::template
|
||||
exec_range< WorkTag >( m_functor
|
||||
ParallelFor::exec_range( m_mdr_policy
|
||||
, m_functor
|
||||
, range.first + m_policy.begin()
|
||||
, range.second + m_policy.begin() );
|
||||
|
||||
|
@ -145,12 +252,15 @@ public:
|
|||
}
|
||||
// END #pragma omp parallel
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, Policy arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, MDRangePolicy arg_policy )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
{}
|
||||
};
|
||||
|
||||
|
@ -191,10 +301,11 @@ private:
|
|||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
OpenMPExec * m_instance;
|
||||
const FunctorType m_functor;
|
||||
const Policy m_policy;
|
||||
const ReducerType m_reducer;
|
||||
const pointer_type m_result_ptr;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
|
@ -228,21 +339,21 @@ public:
|
|||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_bytes =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
m_instance->resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
|
@ -271,16 +382,15 @@ public:
|
|||
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
|
||||
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
|
||||
for ( int i = 1 ; i < pool_size ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
|
||||
, m_instance->get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
@ -303,7 +413,8 @@ public:
|
|||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_view.data() )
|
||||
|
@ -317,7 +428,8 @@ public:
|
|||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
|
@ -329,6 +441,173 @@ public:
|
|||
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ...>
|
||||
, ReducerType
|
||||
, Kokkos::OpenMP
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
|
||||
typedef typename MDRangePolicy::impl_range_policy Policy ;
|
||||
|
||||
typedef typename MDRangePolicy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef typename ReducerTypeFwd::value_type ValueType;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
|
||||
, FunctorType
|
||||
, WorkTag
|
||||
, ValueType
|
||||
>;
|
||||
|
||||
OpenMPExec * m_instance ;
|
||||
const FunctorType m_functor ;
|
||||
const MDRangePolicy m_mdr_policy ;
|
||||
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
inline static
|
||||
void
|
||||
exec_range( const MDRangePolicy & mdr_policy
|
||||
, const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
iterate_type( mdr_policy, functor, update )( iwork );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const
|
||||
{
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_bytes =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
m_instance->resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
);
|
||||
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
|
||||
if ( is_dynamic ) {
|
||||
// Make sure work partition is set before stealing
|
||||
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
reference_type update =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
|
||||
, data.pool_reduce_local() );
|
||||
|
||||
std::pair<int64_t,int64_t> range(0,0);
|
||||
|
||||
do {
|
||||
|
||||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
ParallelReduce::exec_range ( m_mdr_policy, m_functor
|
||||
, range.first + m_policy.begin()
|
||||
, range.second + m_policy.begin()
|
||||
, update );
|
||||
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < pool_size ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, m_instance->get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, MDRangePolicy arg_policy
|
||||
, const ViewType & arg_view
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_view.data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, MDRangePolicy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
@ -361,8 +640,9 @@ private:
|
|||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
OpenMPExec * m_instance;
|
||||
const FunctorType m_functor;
|
||||
const Policy m_policy;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
|
@ -394,23 +674,23 @@ public:
|
|||
inline
|
||||
void execute() const
|
||||
{
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan");
|
||||
|
||||
const int value_count = Analysis::value_count( m_functor );
|
||||
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
|
||||
|
||||
OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
m_instance->resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
|
||||
const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() );
|
||||
|
||||
reference_type update_sum =
|
||||
ValueInit::init( m_functor , data.pool_reduce_local() );
|
||||
|
@ -422,7 +702,7 @@ public:
|
|||
|
||||
pointer_type ptr_prev = 0 ;
|
||||
|
||||
const int n = data.pool_size();
|
||||
const int n = omp_get_num_threads();
|
||||
|
||||
for ( int i = 0 ; i < n ; ++i ) {
|
||||
|
||||
|
@ -452,7 +732,6 @@ public:
|
|||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end() , update_base , true );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
}
|
||||
|
||||
|
@ -461,7 +740,8 @@ public:
|
|||
inline
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
|
||||
|
@ -492,9 +772,10 @@ private:
|
|||
typedef typename Policy::schedule_type::type SchedTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const int m_shmem_size ;
|
||||
OpenMPExec * m_instance;
|
||||
const FunctorType m_functor;
|
||||
const Policy m_policy;
|
||||
const int m_shmem_size;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
|
@ -548,22 +829,22 @@ public:
|
|||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const size_t pool_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPExec::resize_thread_data( pool_reduce_size
|
||||
m_instance->resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
|
@ -598,14 +879,14 @@ public:
|
|||
|
||||
data.disband_team();
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
|
@ -646,11 +927,12 @@ private:
|
|||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shmem_size ;
|
||||
OpenMPExec * m_instance;
|
||||
const FunctorType m_functor;
|
||||
const Policy m_policy;
|
||||
const ReducerType m_reducer;
|
||||
const pointer_type m_result_ptr;
|
||||
const int m_shmem_size;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
|
@ -706,8 +988,7 @@ public:
|
|||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
@ -716,14 +997,15 @@ public:
|
|||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPExec::resize_thread_data( pool_reduce_size
|
||||
m_instance->resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
HostThreadTeamData & data = *(m_instance->get_thread_data());
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
|
@ -763,17 +1045,26 @@ public:
|
|||
}
|
||||
|
||||
data.disband_team();
|
||||
|
||||
// This thread has updated 'pool_reduce_local()' with its
|
||||
// contributions to the reduction. The parallel region is
|
||||
// about to terminate and the master thread will load and
|
||||
// reduce each 'pool_reduce_local()' contribution.
|
||||
// Must 'memory_fence()' to guarantee that storing the update to
|
||||
// 'pool_reduce_local()' will complete before this thread
|
||||
// exits the parallel region.
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
|
||||
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
|
||||
for ( int i = 1 ; i < pool_size ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
|
||||
, m_instance->get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
@ -796,7 +1087,8 @@ public:
|
|||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
|
@ -810,7 +1102,8 @@ public:
|
|||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
: m_instance( t_openmp_instance )
|
||||
, m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
|
|
|
@ -105,7 +105,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
|
|||
{
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using task_root_type = TaskBase< void , void , void > ;
|
||||
using Member = Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
static task_root_type * const end =
|
||||
|
@ -115,23 +115,19 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
|
|||
HostThreadTeamData & team_data_single =
|
||||
HostThreadTeamDataSingleton::singleton();
|
||||
|
||||
const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
|
||||
// const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA
|
||||
Impl::OpenMPExec * instance = t_openmp_instance;
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
const int team_size = 1; // Threads per core
|
||||
instance->resize_thread_data( 0 /* global reduce buffer */
|
||||
, 512 * team_size /* team reduce buffer */
|
||||
, 0 /* team shared buffer */
|
||||
, 0 /* thread local buffer */
|
||||
);
|
||||
|
||||
OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
|
||||
, 512 * team_size /* team reduce buffer */
|
||||
, 0 /* team shared buffer */
|
||||
, 0 /* thread local buffer */
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();
|
||||
Impl::HostThreadTeamData & self = *(instance->get_thread_data());
|
||||
|
||||
// Organizing threads into a team performs a barrier across the
|
||||
// entire pool to ensure proper initialization of the team
|
||||
|
@ -142,18 +138,6 @@ fflush(stdout);
|
|||
Member single_exec( team_data_single );
|
||||
Member team_exec( self );
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, team_exec.team_rank()
|
||||
, team_exec.team_size()
|
||||
, team_exec.league_rank()
|
||||
, team_exec.league_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
task_root_type * task = 0 ;
|
||||
|
@ -197,15 +181,6 @@ fflush(stdout);
|
|||
|
||||
// if a single thread task then execute now
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, int64_t(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
leader_loop = true ;
|
||||
|
@ -220,57 +195,14 @@ fflush(stdout);
|
|||
|
||||
if ( 0 != task ) { // Thread Team Task
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, team_exec.team_rank()
|
||||
, team_exec.team_size()
|
||||
, team_exec.league_rank()
|
||||
, team_exec.league_size()
|
||||
, int64_t(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
(*task->m_apply)( task , & team_exec );
|
||||
|
||||
// The m_apply function performs a barrier
|
||||
}
|
||||
} while( 0 != task );
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, team_exec.team_rank()
|
||||
, team_exec.team_size()
|
||||
, team_exec.league_rank()
|
||||
, team_exec.league_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
self.disband_team();
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
|
@ -279,10 +211,10 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
|||
{
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using task_root_type = TaskBase< void , void , void > ;
|
||||
using Member = Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
if ( 1 == omp_get_num_threads() ) {
|
||||
if ( 1 == OpenMP::thread_pool_size() ) {
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
#define KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
@ -60,7 +60,7 @@ public:
|
|||
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ;
|
||||
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
// Must specify memory space
|
||||
|
|
|
@ -0,0 +1,245 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMP_TEAM_HPP
|
||||
#define KOKKOS_OPENMP_TEAM_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
|
||||
{
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos execution policy
|
||||
typedef TeamPolicyInternal execution_policy ;
|
||||
|
||||
typedef PolicyTraits<Properties ... > traits;
|
||||
|
||||
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
|
||||
m_league_size = p.m_league_size;
|
||||
m_team_size = p.m_team_size;
|
||||
m_team_alloc = p.m_team_alloc;
|
||||
m_team_iter = p.m_team_iter;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & ) {
|
||||
int pool_size = traits::execution_space::thread_pool_size(1);
|
||||
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
return pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType & )
|
||||
{ return traits::execution_space::thread_pool_size(2); }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType &, const int& )
|
||||
{ return traits::execution_space::thread_pool_size(2); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
private:
|
||||
|
||||
int m_league_size ;
|
||||
int m_team_size ;
|
||||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_team_scratch_size[2];
|
||||
size_t m_thread_scratch_size[2];
|
||||
|
||||
int m_chunk_size;
|
||||
|
||||
inline void init( const int league_size_request
|
||||
, const int team_size_request )
|
||||
{
|
||||
const int pool_size = traits::execution_space::thread_pool_size(0);
|
||||
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
const int team_grain = traits::execution_space::thread_pool_size(2);
|
||||
|
||||
m_league_size = league_size_request ;
|
||||
|
||||
m_team_size = team_size_request < team_max ?
|
||||
team_size_request : team_max ;
|
||||
|
||||
// Round team size up to a multiple of 'team_gain'
|
||||
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
|
||||
const int team_count = pool_size / team_size_grain ;
|
||||
|
||||
// Constraint : pool_size = m_team_alloc * team_count
|
||||
m_team_alloc = pool_size / team_count ;
|
||||
|
||||
// Maxumum number of iterations each team will take:
|
||||
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
|
||||
|
||||
set_auto_chunk_size();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
|
||||
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
|
||||
if(team_size_ < 0) team_size_ = m_team_size;
|
||||
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
|
||||
}
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1)
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
|
||||
|
||||
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int team_iter() const { return m_team_iter ; }
|
||||
|
||||
inline int chunk_size() const { return m_chunk_size ; }
|
||||
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_chunk_size = chunk_size_;
|
||||
return p;
|
||||
}
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
private:
|
||||
/** \brief finalize chunk_size if it was set to AUTO*/
|
||||
inline void set_auto_chunk_size() {
|
||||
|
||||
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
|
||||
if( concurrency==0 ) concurrency=1;
|
||||
|
||||
if(m_chunk_size > 0) {
|
||||
if(!Impl::is_integral_power_of_two( m_chunk_size ))
|
||||
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
|
||||
}
|
||||
|
||||
int new_chunk_size = 1;
|
||||
while(new_chunk_size*100*concurrency < m_league_size)
|
||||
new_chunk_size *= 2;
|
||||
if(new_chunk_size < 128) {
|
||||
new_chunk_size = 1;
|
||||
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
|
||||
new_chunk_size*=2;
|
||||
}
|
||||
m_chunk_size = new_chunk_size;
|
||||
}
|
||||
|
||||
public:
|
||||
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif
|
||||
#endif /* KOKKOS_OPENMP_TEAM_HPP */
|
||||
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
|
||||
#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType ,
|
||||
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
|
||||
Kokkos::OpenMP
|
||||
>
|
||||
: public Kokkos::Impl::Experimental::
|
||||
WorkGraphExec< FunctorType,
|
||||
Kokkos::OpenMP,
|
||||
Traits ...
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
|
||||
typedef Kokkos::Impl::Experimental::
|
||||
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
|
||||
|
||||
template< class TagType >
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_one(const typename Policy::member_type& i) const {
|
||||
Base::m_functor( i );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_one(const typename Policy::member_type& i) const {
|
||||
const TagType t{} ;
|
||||
Base::m_functor( t , i );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute()
|
||||
{
|
||||
const int pool_size = OpenMP::thread_pool_size();
|
||||
|
||||
#pragma omp parallel num_threads(pool_size)
|
||||
{
|
||||
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
|
||||
exec_one< typename Policy::work_tag >( i );
|
||||
Base::after_work(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: Base( arg_functor, arg_policy )
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */
|
|
@ -1,13 +1,13 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
|
@ -36,7 +36,7 @@
|
|||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
@ -45,7 +45,7 @@
|
|||
#define KOKKOS_OPENMPTARGETEXEC_HPP
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_Spinwait.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <iostream>
|
||||
|
@ -59,10 +59,10 @@ namespace Impl {
|
|||
|
||||
|
||||
class OpenMPTargetExec {
|
||||
public:
|
||||
public:
|
||||
enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
|
||||
enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 };
|
||||
|
||||
|
||||
private:
|
||||
static void* scratch_ptr;
|
||||
|
||||
|
@ -70,7 +70,7 @@ public:
|
|||
static void verify_is_process( const char * const );
|
||||
static void verify_initialized( const char * const );
|
||||
|
||||
static void* get_scratch_ptr();
|
||||
static void* get_scratch_ptr();
|
||||
static void clear_scratch();
|
||||
static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );
|
||||
|
||||
|
@ -159,7 +159,7 @@ public:
|
|||
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
template<class ValueType>
|
||||
|
@ -191,13 +191,13 @@ public:
|
|||
|
||||
typedef ValueType value_type;
|
||||
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
|
||||
|
||||
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
|
||||
, value_type , void >::type type ;
|
||||
|
||||
const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
|
||||
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
|
||||
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
|
||||
for(int i = m_team_rank; i < n_values; i+= m_team_size) {
|
||||
team_scratch[i] = value_type();
|
||||
}
|
||||
|
@ -209,7 +209,7 @@ public:
|
|||
team_scratch[m_team_rank%n_values]+=value;
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
|
||||
for(int d = 1; d<n_values;d*=2) {
|
||||
if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
|
||||
team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
|
||||
|
@ -374,12 +374,12 @@ private:
|
|||
int m_chunk_size;
|
||||
|
||||
inline void init( const int league_size_request
|
||||
, const int team_size_request
|
||||
, const int team_size_request
|
||||
, const int vector_length_request )
|
||||
{
|
||||
m_league_size = league_size_request ;
|
||||
|
||||
m_team_size = team_size_request;
|
||||
m_team_size = team_size_request;
|
||||
|
||||
m_vector_length = vector_length_request;
|
||||
|
||||
|
|
|
@ -47,7 +47,7 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_Spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -45,14 +45,14 @@
|
|||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
@ -80,9 +80,7 @@ const void * volatile s_current_function_arg = 0 ;
|
|||
|
||||
struct Sentinel {
|
||||
Sentinel()
|
||||
{
|
||||
HostSpace::register_in_parallel( ThreadsExec::in_parallel );
|
||||
}
|
||||
{}
|
||||
|
||||
~Sentinel()
|
||||
{
|
||||
|
@ -122,6 +120,8 @@ void execute_function_noop( ThreadsExec & , const void * ) {}
|
|||
|
||||
void ThreadsExec::driver(void)
|
||||
{
|
||||
SharedAllocationRecord< void, void >::tracking_enable();
|
||||
|
||||
ThreadsExec this_thread ;
|
||||
|
||||
while ( ThreadsExec::Active == this_thread.m_pool_state ) {
|
||||
|
@ -726,6 +726,8 @@ void ThreadsExec::initialize( unsigned thread_count ,
|
|||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
Impl::SharedAllocationRecord< void, void >::tracking_enable();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
|
|
|
@ -50,11 +50,12 @@
|
|||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_Spinwait.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
#include <Kokkos_UniqueToken.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -275,6 +276,17 @@ public:
|
|||
if ( ! rev_rank ) {
|
||||
Final::final( f , reduce_memory() );
|
||||
}
|
||||
|
||||
// This thread has updated 'reduce_memory()' and upon returning
|
||||
// from this function will set 'm_pool_state' to inactive.
|
||||
// If this is a non-root thread then setting 'm_pool_state'
|
||||
// to inactive triggers another thread to exit a spinwait
|
||||
// and read the 'reduce_memory'.
|
||||
// Must 'memory_fence()' to guarantee that storing the update to
|
||||
// 'reduce_memory()' will complete before storing the update to
|
||||
// 'm_pool_state'.
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
inline
|
||||
|
@ -627,6 +639,62 @@ inline void Threads::fence()
|
|||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
template<>
|
||||
class UniqueToken< Threads, UniqueTokenScope::Instance>
|
||||
{
|
||||
public:
|
||||
using execution_space = Threads;
|
||||
using size_type = int;
|
||||
|
||||
/// \brief create object size for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() ) noexcept {}
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
inline
|
||||
int size() const noexcept { return Threads::thread_pool_size(); }
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
inline
|
||||
int acquire() const noexcept { return Threads::thread_pool_rank(); }
|
||||
|
||||
/// \brief release a value acquired by generate
|
||||
inline
|
||||
void release( int ) const noexcept {}
|
||||
};
|
||||
|
||||
template<>
|
||||
class UniqueToken< Threads, UniqueTokenScope::Global>
|
||||
{
|
||||
public:
|
||||
using execution_space = Threads;
|
||||
using size_type = int;
|
||||
|
||||
/// \brief create object size for concurrency on the given instance
|
||||
///
|
||||
/// This object should not be shared between instances
|
||||
UniqueToken( execution_space const& = execution_space() ) noexcept {}
|
||||
|
||||
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
|
||||
inline
|
||||
int size() const noexcept { return Threads::thread_pool_size(); }
|
||||
|
||||
/// \brief acquire value such that 0 <= value < size()
|
||||
inline
|
||||
int acquire() const noexcept { return Threads::thread_pool_rank(); }
|
||||
|
||||
/// \brief release a value acquired by generate
|
||||
inline
|
||||
void release( int ) const noexcept {}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
#endif
|
||||
|
|
|
@ -50,7 +50,7 @@
|
|||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_Spinwait.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
|
||||
|
@ -482,6 +482,8 @@ public:
|
|||
void next_static()
|
||||
{
|
||||
if ( m_league_rank < m_league_end ) {
|
||||
// Make sure all stores are complete before entering the barrier
|
||||
memory_fence();
|
||||
team_barrier();
|
||||
set_team_shared();
|
||||
}
|
||||
|
@ -518,6 +520,8 @@ public:
|
|||
return;
|
||||
|
||||
if ( m_league_rank < m_league_chunk_end ) {
|
||||
// Make sure all stores are complete before entering the barrier
|
||||
memory_fence();
|
||||
team_barrier();
|
||||
set_team_shared();
|
||||
}
|
||||
|
|
|
@ -55,6 +55,8 @@
|
|||
#include <impl/Kokkos_StaticAssert.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
@ -174,6 +176,108 @@ public:
|
|||
{}
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, Kokkos::Threads
|
||||
>
|
||||
{
|
||||
private:
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
|
||||
typedef typename MDRangePolicy::impl_range_policy Policy ;
|
||||
|
||||
typedef typename MDRangePolicy::work_tag WorkTag ;
|
||||
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const MDRangePolicy m_mdr_policy ;
|
||||
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
|
||||
|
||||
inline static
|
||||
void
|
||||
exec_range( const MDRangePolicy & mdr_policy
|
||||
, const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
iterate_type( mdr_policy, functor )( i );
|
||||
}
|
||||
}
|
||||
|
||||
static void exec( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
exec_schedule<typename Policy::schedule_type::type>(exec,arg);
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
static
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
|
||||
exec_schedule( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
|
||||
|
||||
ParallelFor::exec_range
|
||||
( self.m_mdr_policy, self.m_functor , range.begin() , range.end() );
|
||||
|
||||
exec.fan_in();
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
static
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
|
||||
exec_schedule( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
|
||||
|
||||
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
|
||||
exec.reset_steal_target();
|
||||
exec.barrier();
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
|
||||
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
|
||||
|
||||
ParallelFor::exec_range
|
||||
( self.m_mdr_policy, self.m_functor , begin , end );
|
||||
work_index = exec.get_work_index();
|
||||
}
|
||||
|
||||
exec.fan_in();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::start( & ParallelFor::exec , this );
|
||||
ThreadsExec::fence();
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const MDRangePolicy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
{}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelFor Kokkos::Threads with TeamPolicy */
|
||||
|
||||
|
@ -440,6 +544,169 @@ public:
|
|||
|
||||
};
|
||||
|
||||
|
||||
// MDRangePolicy impl
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::Experimental::MDRangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Threads
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
|
||||
typedef typename MDRangePolicy::impl_range_policy Policy ;
|
||||
|
||||
typedef typename MDRangePolicy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef typename ReducerTypeFwd::value_type ValueType;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
|
||||
, FunctorType
|
||||
, WorkTag
|
||||
, ValueType
|
||||
>;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const MDRangePolicy m_mdr_policy ;
|
||||
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
inline static
|
||||
void
|
||||
exec_range( const MDRangePolicy & mdr_policy
|
||||
, const FunctorType & functor
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
iterate_type( mdr_policy, functor, update )( i );
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
exec( ThreadsExec & exec , const void * arg ) {
|
||||
exec_schedule<typename Policy::schedule_type::type>(exec, arg);
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
static
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
|
||||
exec_schedule( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
ParallelReduce::exec_range
|
||||
( self.m_mdr_policy, self.m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
|
||||
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
static
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
|
||||
exec_schedule( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
|
||||
exec.reset_steal_target();
|
||||
exec.barrier();
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
|
||||
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
|
||||
ParallelReduce::exec_range
|
||||
( self.m_mdr_policy, self.m_functor , begin , end
|
||||
, update );
|
||||
work_index = exec.get_work_index();
|
||||
}
|
||||
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
|
||||
ThreadsExec::start( & ParallelReduce::exec , this );
|
||||
|
||||
ThreadsExec::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
|
||||
const pointer_type data =
|
||||
(pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
|
||||
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const MDRangePolicy & arg_policy ,
|
||||
const HostViewType & arg_result_view ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< HostViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< HostViewType >::value
|
||||
, "Kokkos::Threads reduce result must be a View" );
|
||||
|
||||
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
|
||||
, "Kokkos::Threads reduce result must be a View in HostSpace" );
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, MDRangePolicy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_mdr_policy( arg_policy )
|
||||
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelReduce with Kokkos::Threads and TeamPolicy */
|
||||
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
|
||||
#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType ,
|
||||
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
|
||||
Kokkos::Threads
|
||||
>
|
||||
: public Kokkos::Impl::Experimental::
|
||||
WorkGraphExec< FunctorType,
|
||||
Kokkos::Threads,
|
||||
Traits ...
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
|
||||
typedef Kokkos::Impl::Experimental::
|
||||
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
|
||||
typedef ParallelFor<FunctorType,
|
||||
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
|
||||
Kokkos::Threads> Self ;
|
||||
|
||||
template< class TagType >
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_one(const typename Policy::member_type& i) const {
|
||||
Base::m_functor( i );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_one(const typename Policy::member_type& i) const {
|
||||
const TagType t{} ;
|
||||
Base::m_functor( t , i );
|
||||
}
|
||||
|
||||
inline void exec_one_thread() const {
|
||||
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
|
||||
exec_one< typename Policy::work_tag >( i );
|
||||
Base::after_work(i);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void thread_main( ThreadsExec&, const void* arg ) {
|
||||
const Self& self = *(static_cast<const Self*>(arg));
|
||||
self.exec_one_thread();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute()
|
||||
{
|
||||
ThreadsExec::start( & Self::thread_main, this );
|
||||
ThreadsExec::fence();
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: Base( arg_functor, arg_policy )
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */
|
|
@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
|||
#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]
|
||||
|
||||
|
||||
|
||||
// New Loop Macros...
|
||||
// parallel_for, non-tagged
|
||||
#define APPLY( func, ... ) \
|
||||
|
@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
|||
// end tagged macros
|
||||
|
||||
|
||||
|
||||
|
||||
// Structs for calling loops
|
||||
template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
|
||||
struct Tile_Loop_Type;
|
||||
|
@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
|
|||
template <typename T>
|
||||
using is_void = std::is_same< T , void >;
|
||||
|
||||
/// \brief Trait detecting an array-of-unknown-bound type 'T[]'.
///
/// Primary template: 'value' is false and 'value_type' is T unchanged.
template < typename T >
struct is_type_array : std::integral_constant< bool , false >
{
  using value_type = T ;
};

/// Specialization for 'T[]': 'value' is true and 'value_type' is the
/// element type T.  Bounded arrays 'T[N]' do not match this specialization.
template < typename T >
struct is_type_array< T[] > : std::integral_constant< bool , true >
{
  using value_type = T ;
};
|
||||
|
||||
|
||||
template < typename RP
|
||||
, typename Functor
|
||||
, typename Tag = void
|
||||
|
@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
|||
RP const& m_rp;
|
||||
Functor const& m_func;
|
||||
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
|
||||
// value_type & m_v;
|
||||
|
||||
};
|
||||
|
||||
|
||||
// ValueType: For reductions
|
||||
// For ParallelReduce
|
||||
// ValueType - scalar: For reductions
|
||||
template < typename RP
|
||||
, typename Functor
|
||||
, typename Tag
|
||||
, typename ValueType
|
||||
>
|
||||
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type >
|
||||
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
|
||||
{
|
||||
using index_type = typename RP::index_type;
|
||||
using point_type = typename RP::point_type;
|
||||
|
@ -2251,12 +2260,497 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
|||
};
|
||||
|
||||
|
||||
// For ParallelReduce
|
||||
// Extra specialization for array reductions
|
||||
// ValueType[]: For array reductions
|
||||
template < typename RP
|
||||
, typename Functor
|
||||
, typename Tag
|
||||
, typename ValueType
|
||||
>
|
||||
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
|
||||
{
|
||||
using index_type = typename RP::index_type;
|
||||
using point_type = typename RP::point_type;
|
||||
|
||||
using value_type = typename is_type_array<ValueType>::value_type; // strip away the 'array-ness' [], only underlying type remains
|
||||
|
||||
inline
|
||||
HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treat as pointer for compatibility since size is not known nor needed here
|
||||
: m_rp(rp) //Cuda 7.0 does not like braces...
|
||||
, m_func(func)
|
||||
, m_v(v) // use with non-void ValueType struct
|
||||
{}
|
||||
|
||||
inline
|
||||
bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
|
||||
bool is_full_tile = true;
|
||||
|
||||
for ( int i = 0; i < RP::rank; ++i ) {
|
||||
if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
|
||||
partial_tile[i] = m_rp.m_tile[i] ;
|
||||
}
|
||||
else {
|
||||
is_full_tile = false ;
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
|
||||
}
|
||||
}
|
||||
|
||||
return is_full_tile ;
|
||||
} // end check bounds
|
||||
|
||||
|
||||
template <int Rank>
|
||||
struct RankTag
|
||||
{
|
||||
typedef RankTag type;
|
||||
enum { value = (int)Rank };
|
||||
};
|
||||
|
||||
|
||||
#if KOKKOS_ENABLE_NEW_LOOP_MACROS
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{ operator_impl( tile_idx , RankTag<RP::rank>() ); }
|
||||
// added due to compiler error when using sfinae to choose operator based on rank
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<2> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_2L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_2L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_2R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_2R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 2
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<3> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_3L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_3L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_3R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_3R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 3
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<4> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_4L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_4L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_4R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_4R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 4
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<5> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_5L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_5L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_5R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_5R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 5
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<6> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_6L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_6L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_6R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_6R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 6
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<7> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_7L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_7L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_7R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_7R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 7
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<8> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_8L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_8L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_8R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_8R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 8
|
||||
#endif
|
||||
|
||||
|
||||
template <typename... Args>
|
||||
typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
|
||||
apply(Args &&... args) const
|
||||
{
|
||||
m_func(args... , m_v);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
|
||||
apply(Args &&... args) const
|
||||
{
|
||||
m_func( m_tag, args... , m_v);
|
||||
}
|
||||
|
||||
|
||||
RP const& m_rp;
|
||||
Functor const& m_func;
|
||||
value_type * m_v;
|
||||
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
|
||||
|
||||
};
|
||||
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
// MDFunctor - wraps the range_policy and functor to pass to IterateTile
|
||||
// Serial, Threads, OpenMP
|
||||
// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
|
||||
// Cuda uses DeviceIterateTile directly within md_parallel_for
|
||||
// ParallelReduce
|
||||
// TODO Once md_parallel_{for,reduce} removed, this can be removed
|
||||
|
||||
// ParallelReduce - scalar reductions
|
||||
template < typename MDRange, typename Functor, typename ValueType = void >
|
||||
struct MDFunctor
|
||||
{
|
||||
|
@ -2273,7 +2767,7 @@ struct MDFunctor
|
|||
|
||||
|
||||
inline
|
||||
MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
|
||||
MDFunctor( MDRange const& range, Functor const& f )
|
||||
: m_range( range )
|
||||
, m_func( f )
|
||||
{}
|
||||
|
@ -2290,7 +2784,6 @@ struct MDFunctor
|
|||
inline
|
||||
MDFunctor& operator=( MDFunctor && ) = default;
|
||||
|
||||
// KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning
|
||||
inline
|
||||
void operator()(index_type t, value_type & v) const
|
||||
{
|
||||
|
@ -2301,6 +2794,56 @@ struct MDFunctor
|
|||
Functor m_func;
|
||||
};
|
||||
|
||||
|
||||
// ParallelReduce - array reductions
|
||||
template < typename MDRange, typename Functor, typename ValueType >
|
||||
struct MDFunctor< MDRange, Functor, ValueType[] >
|
||||
{
|
||||
using range_policy = MDRange;
|
||||
using functor_type = Functor;
|
||||
using value_type = ValueType[];
|
||||
using work_tag = typename range_policy::work_tag;
|
||||
using index_type = typename range_policy::index_type;
|
||||
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
|
||||
, Functor
|
||||
, work_tag
|
||||
, value_type
|
||||
>;
|
||||
|
||||
|
||||
inline
|
||||
MDFunctor( MDRange const& range, Functor const& f )
|
||||
: m_range( range )
|
||||
, m_func( f )
|
||||
, value_count( f.value_count )
|
||||
{}
|
||||
|
||||
inline
|
||||
MDFunctor( MDFunctor const& ) = default;
|
||||
|
||||
inline
|
||||
MDFunctor& operator=( MDFunctor const& ) = default;
|
||||
|
||||
inline
|
||||
MDFunctor( MDFunctor && ) = default;
|
||||
|
||||
inline
|
||||
MDFunctor& operator=( MDFunctor && ) = default;
|
||||
|
||||
// FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
|
||||
// Best path forward is to eliminate need for MDFunctor, directly use MDRangePolicy within Parallel{For,Reduce} ??
|
||||
inline
|
||||
void operator()(index_type t, value_type v) const
|
||||
{
|
||||
iterate_type(m_range, m_func, v)(t);
|
||||
}
|
||||
|
||||
MDRange m_range;
|
||||
Functor m_func;
|
||||
size_t value_count;
|
||||
};
|
||||
|
||||
|
||||
// ParallelFor
|
||||
template < typename MDRange, typename Functor >
|
||||
struct MDFunctor< MDRange, Functor, void >
|
||||
|
@ -2349,4 +2892,3 @@ struct MDFunctor< MDRange, Functor, void >
|
|||
} } } //end namespace Kokkos::Experimental::Impl
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -55,16 +55,19 @@ template < typename ExecutionSpace = void
|
|||
, typename WorkTag = void
|
||||
, typename IndexType = void
|
||||
, typename IterationPattern = void
|
||||
, typename LaunchBounds = void
|
||||
>
|
||||
struct PolicyTraitsBase
|
||||
{
|
||||
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
|
||||
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType,
|
||||
IterationPattern, LaunchBounds>;
|
||||
|
||||
using execution_space = ExecutionSpace;
|
||||
using schedule_type = Schedule;
|
||||
using work_tag = WorkTag;
|
||||
using index_type = IndexType;
|
||||
using iteration_pattern = IterationPattern;
|
||||
using launch_bounds = LaunchBounds;
|
||||
};
|
||||
|
||||
|
||||
|
@ -78,6 +81,7 @@ struct SetExecutionSpace
|
|||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
@ -91,6 +95,7 @@ struct SetSchedule
|
|||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
@ -104,6 +109,7 @@ struct SetWorkTag
|
|||
, WorkTag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
@ -117,6 +123,7 @@ struct SetIndexType
|
|||
, typename PolicyBase::work_tag
|
||||
, IndexType
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
@ -131,6 +138,22 @@ struct SetIterationPattern
|
|||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, IterationPattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
||||
template <typename PolicyBase, typename LaunchBounds>
|
||||
struct SetLaunchBounds
|
||||
{
|
||||
static_assert( is_void<typename PolicyBase::launch_bounds>::value
|
||||
, "Kokkos Error: More than one launch_bounds given" );
|
||||
using type = PolicyTraitsBase< typename PolicyBase::execution_space
|
||||
, typename PolicyBase::schedule_type
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, LaunchBounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
@ -146,8 +169,9 @@ struct AnalyzePolicy<Base, T, Traits...> : public
|
|||
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
|
||||
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
|
||||
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
|
||||
, typename std::conditional< is_launch_bounds<T>::value , SetLaunchBounds<Base,T>
|
||||
, SetWorkTag<Base,T>
|
||||
>::type >::type >::type >::type>::type::type
|
||||
>::type >::type >::type >::type >::type>::type::type
|
||||
, Traits...
|
||||
>
|
||||
{};
|
||||
|
@ -178,11 +202,18 @@ struct AnalyzePolicy<Base>
|
|||
, void // TODO set default iteration pattern
|
||||
, typename Base::iteration_pattern
|
||||
>::type;
|
||||
|
||||
using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
|
||||
, LaunchBounds<>
|
||||
, typename Base::launch_bounds
|
||||
>::type;
|
||||
|
||||
using type = PolicyTraitsBase< execution_space
|
||||
, schedule_type
|
||||
, work_tag
|
||||
, index_type
|
||||
, iteration_pattern
|
||||
, launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
|
||||
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
|
||||
|
@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
|
|||
|
||||
// Atomic compare-and-swap on an int: if *dest equals compare, val is stored.
// Returns the value *dest held before the operation.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_val_compare_and_swap(dest,compare,val);
}
|
||||
|
||||
// Atomic compare-and-swap on a long: if *dest equals compare, val is stored.
// Returns the value *dest held before the operation.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_val_compare_and_swap(dest,compare,val);
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
|
@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
|
|||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
|
||||
return tmp.t ;
|
||||
}
|
||||
|
@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
|
|||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
|
||||
return tmp.t ;
|
||||
}
|
||||
|
@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
|
|||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
|
||||
return tmp.t ;
|
||||
}
|
||||
|
@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
|
|||
#endif
|
||||
, const T >::type& val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
if( return_val == compare ) {
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_DECREMENT_HPP
|
||||
|
@ -54,6 +58,10 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<char>(volatile char* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock decb %0"
|
||||
: /* no output registers */
|
||||
|
@ -69,6 +77,10 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<short>(volatile short* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock decw %0"
|
||||
: /* no output registers */
|
||||
|
@ -84,6 +96,10 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<int>(volatile int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock decl %0"
|
||||
: /* no output registers */
|
||||
|
@ -99,6 +115,9 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<long long int>(volatile long long int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock decq %0"
|
||||
: /* no output registers */
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
|
||||
#define KOKKOS_ATOMIC_EXCHANGE_HPP
|
||||
|
@ -81,6 +85,10 @@ T atomic_exchange(
|
|||
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
|
||||
{
|
||||
// int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
|
||||
return *((T*)&tmp);
|
||||
}
|
||||
|
@ -93,6 +101,11 @@ T atomic_exchange(
|
|||
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
|
||||
{
|
||||
typedef unsigned long long int type ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
// type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
|
||||
type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
|
||||
return *((T*)&tmp);
|
||||
|
@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest ,
|
|||
{
|
||||
T return_val;
|
||||
// This is a way to (hopefully) avoid dead lock in a warp
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
int done = 0;
|
||||
unsigned int active = __ballot(1);
|
||||
unsigned int done_active = 0;
|
||||
|
@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest ,
|
|||
, const T & >::type val )
|
||||
{
|
||||
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
const type v = *((type*)&val); // Extract to be sure the value doesn't change
|
||||
|
||||
|
@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest ,
|
|||
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
|
||||
, const T & >::type val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
union U {
|
||||
Impl::cas128_t i ;
|
||||
T t ;
|
||||
|
@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest ,
|
|||
{
|
||||
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
const type v = *((type*)&val); // Extract to be sure the value doesn't change
|
||||
|
||||
type assumed ;
|
||||
|
@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest ,
|
|||
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
|
||||
, const T & >::type val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
union U {
|
||||
Impl::cas128_t i ;
|
||||
T t ;
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
|
||||
|
@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
inline
|
||||
int atomic_fetch_add( volatile int * dest , const int val )
|
||||
{
|
||||
int original = val;
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock xadd %1, %0"
|
||||
: "+m" (*dest), "+r" (original)
|
||||
: "m" (*dest), "r" (original)
|
||||
: "memory"
|
||||
int original = val;
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock xadd %1, %0"
|
||||
: "+m" (*dest), "+r" (original)
|
||||
: "m" (*dest), "r" (original)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
return original;
|
||||
return original;
|
||||
}
|
||||
#else
|
||||
// Atomically adds val to *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
int atomic_fetch_add( volatile int * const dest , const int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_add(dest, val);
}
|
||||
#endif
|
||||
|
||||
// Atomically adds val to *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
long int atomic_fetch_add( volatile long int * const dest , const long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_add(dest,val);
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
// Atomically adds val to *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_add(dest,val);
}
|
||||
|
||||
// Atomically adds val to *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_add(dest,val);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
inline U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
|
@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
inline U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
|
@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest ,
|
|||
inline U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_AND_HPP
|
||||
|
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const
|
|||
|
||||
// Atomically replaces *dest with (*dest & val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
int atomic_fetch_and( volatile int * const dest , const int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_and(dest,val);
}
|
||||
|
||||
// Atomically replaces *dest with (*dest & val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
long int atomic_fetch_and( volatile long int * const dest , const long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_and(dest,val);
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
// Atomically replaces *dest with (*dest & val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_and(dest,val);
}
|
||||
|
||||
// Atomically replaces *dest with (*dest & val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_and(dest,val);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_OR_HPP
|
||||
|
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const
|
|||
|
||||
// Atomically replaces *dest with (*dest | val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
int atomic_fetch_or( volatile int * const dest , const int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_or(dest,val);
}
|
||||
|
||||
// Atomically replaces *dest with (*dest | val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
long int atomic_fetch_or( volatile long int * const dest , const long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_or(dest,val);
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
// Atomically replaces *dest with (*dest | val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_or(dest,val);
}
|
||||
|
||||
// Atomically replaces *dest with (*dest | val) and returns the previous value.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_or(dest,val);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
|
||||
|
@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
|
||||
// Atomically subtracts val from *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
int atomic_fetch_sub( volatile int * const dest , const int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_sub(dest,val);
}
|
||||
|
||||
// Atomically subtracts val from *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
long int atomic_fetch_sub( volatile long int * const dest , const long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_sub(dest,val);
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
// Atomically subtracts val from *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_sub(dest,val);
}
|
||||
|
||||
// Atomically subtracts val from *dest and returns the value *dest held beforehand.
// The stale one-line body from the pre-change revision is dropped; a function
// definition can only have one body.
inline
unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Request the destination cache line with write intent ahead of the locked operation.
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_sub(dest,val);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
{
|
||||
union { int i ; T t ; } assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
|
@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
|
||||
sizeof(T) == sizeof(long) , const T >::type val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
union { long i ; T t ; } assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile T * const dest ,
|
|||
&& ( sizeof(T) != 8 )
|
||||
, const T >::type& val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
*dest = return_val - val;
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_INCREMENT_HPP
|
||||
|
@ -52,6 +56,9 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<char>(volatile char* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incb %0"
|
||||
: /* no output registers */
|
||||
|
@ -67,6 +74,9 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<short>(volatile short* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incw %0"
|
||||
: /* no output registers */
|
||||
|
@ -82,6 +92,9 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<int>(volatile int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incl %0"
|
||||
: /* no output registers */
|
||||
|
@ -97,6 +110,9 @@ template<>
|
|||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<long long int>(volatile long long int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incq %0"
|
||||
: /* no output registers */
|
||||
|
|
|
@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
|||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
|
||||
if(num_threads>0) {
|
||||
if(use_numa>0) {
|
||||
Kokkos::OpenMP::initialize(num_threads,use_numa);
|
||||
}
|
||||
else {
|
||||
Kokkos::OpenMP::initialize(num_threads);
|
||||
}
|
||||
} else {
|
||||
Kokkos::OpenMP::initialize();
|
||||
if(use_numa>0) {
|
||||
Kokkos::OpenMP::initialize(num_threads,use_numa);
|
||||
}
|
||||
else {
|
||||
Kokkos::OpenMP::initialize(num_threads);
|
||||
}
|
||||
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
|
||||
}
|
||||
else {
|
||||
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
|
||||
|
@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[])
|
|||
iarg++;
|
||||
}
|
||||
|
||||
InitArguments arguments;
|
||||
arguments.num_threads = num_threads;
|
||||
arguments.num_numa = numa;
|
||||
arguments.device_id = device;
|
||||
InitArguments arguments{num_threads, numa, device};
|
||||
Impl::initialize_internal(arguments);
|
||||
}
|
||||
|
||||
|
|
|
@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType:
|
|||
static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
|
||||
"Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
|
||||
|
||||
/* this cast to bool is needed for correctness by NVCC */
|
||||
enum : bool { IsArray = static_cast<bool>(Impl::is_array< typename FunctorType::value_type >::value) };
|
||||
|
||||
// If not an array then what is the sizeof(value_type)
|
||||
enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
|
||||
enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };
|
||||
|
||||
typedef value_type * pointer_type ;
|
||||
|
||||
// The reference_type for an array is 'value_type *'
|
||||
// The reference_type for a single value is 'value_type &'
|
||||
|
||||
typedef typename Impl::if_c< ! StaticValueSize , value_type *
|
||||
, value_type & >::type reference_type ;
|
||||
typedef typename Impl::if_c< IsArray , value_type *
|
||||
, value_type & >::type reference_type ;
|
||||
|
||||
// Number of values if single value
|
||||
template< class F >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! IsArray , unsigned >::type
|
||||
value_count( const F & ) { return 1 ; }
|
||||
|
||||
// Number of values if an array, protect via templating because 'f.value_count'
|
||||
// will only exist when the functor declares the value_type to be an array.
|
||||
template< class F >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && IsArray , unsigned >::type
|
||||
value_count( const F & f ) { return f.value_count ; }
|
||||
|
||||
// Total size of the value
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue